Per-feed filters and scrubbing
commit 9d85882503 (parent 88bdbe930e)

filters/stripAd/feedburner.sed (new file, 1 line)
@@ -0,0 +1 @@
+s|<p><a href="http://[a-zA-Z0-9\-\.]*/~a/[a-zA-Z0-9]*?a=[a-zA-Z0-9]*"><img border="0" src="http://[a-zA-Z0-9\.\-]*/~a/[a-zA-Z0-9/]*?i=[a-zA-Z0-9]*"/></a></p>||g
filters/stripAd/yahoo.sed (new file, 1 line)
@@ -0,0 +1 @@
+s|<p><!-- begin(Yahoo ad) -->.*<!-- end(Yahoo ad) --></p>||
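
Each stripAd filter is a single sed substitution that deletes the matching ad markup outright. For illustration, this is roughly what the yahoo rule does to the markup used by the test data later in this commit (a sketch, assuming sed is on the PATH and the repository root is the working directory):

    from subprocess import Popen, PIPE

    doc = ('<div>before-<p><!-- begin(Yahoo ad) -->ad content here'
           '<!-- end(Yahoo ad) --></p>-after</div>')
    proc = Popen(['sed', '-f', 'filters/stripAd/yahoo.sed'],
                 stdin=PIPE, stdout=PIPE)
    out, _ = proc.communicate(doc)
    print(out)   # <div>before--after</div>

Note that sed applies the expression line by line, so the begin/end ad comments must appear on one line for the `.*` to span them.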
planet/config.py
@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')
 
     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')
 
 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
                 result.append(section)
     return result
 
+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),
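
The new filters() accessor appends a feed section's own filters after the planet-wide ones, so global filters always run first. A minimal sketch of the merge (the feed URL is hypothetical; the test_filters case added to tests/test_config.py below exercises the same behavior):

    import StringIO
    from planet import config

    ini = ('[Planet]\n'
           'filters = stripAd/feedburner.sed\n'
           '\n'
           '[http://example.com/feed.xml]\n'   # hypothetical feed section
           'filters = stripAd/yahoo.sed\n')
    config.parser.readfp(StringIO.StringIO(ini))

    # planet-wide filters first, then the feed's own additions
    print(config.filters('http://example.com/feed.xml'))
    # ['stripAd/feedburner.sed', 'stripAd/yahoo.sed']
    print(config.filters())
    # ['stripAd/feedburner.sed']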
planet/shell/sed.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout
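
Two behaviors of run() worth noting: with no output_file, stdout is a pipe and the filtered document comes back as a string; with output_file, stdout is redirected to that file and the return value is None. Any stderr from sed is logged either way. A usage sketch (assuming sed is installed and the project root is the working directory; the output path is illustrative):

    from planet.shell import sed

    doc = open('tests/data/filter/stripAd-yahoo.xml').read()

    # in memory: returns the filtered document as a string
    filtered = sed.run('filters/stripAd/yahoo.sed', doc)

    # to a file (hypothetical path): stdout is redirected, so run() returns None
    sed.run('filters/stripAd/yahoo.sed', doc, output_file='filtered.xml')

In the pipeline this module is not called directly; spiderFeed goes through shell.run(filter, output, mode="filter"), which hands .sed scripts to this handler.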
planet/spider.py
@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()
 
+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+           data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+               entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                   source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
 
@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status
 
+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''<feed xmlns:planet="%s"
@@ -147,7 +205,6 @@ def spiderFeed(feed):
     # write each entry to the cache
     cache = config.cache_directory()
     for entry in data.entries:
-
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
@@ -158,6 +215,9 @@ def spiderFeed(feed):
 
         # get updated-date either from the entry or the cache (default to now)
         mtime = None
+        if not entry.has_key('updated_parsed'):
+            if entry.has_key('published_parsed'):
+                entry['updated_parsed'] = entry.published_parsed
         if entry.has_key('updated_parsed'):
             mtime = calendar.timegm(entry.updated_parsed)
         if mtime > time.time(): mtime = None
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
             output = shell.run(filter, output, mode="filter")
             if not output: return
 
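
scrub() is driven entirely by per-feed configuration: ignore_in_feed deletes untrusted entry keys (plus their _detail and _parsed variants), the *_type options overwrite declared types (short names like 'html' expand through type_map), and an html name_type strips markup from author names. A small sketch of the type override on a parsed feed (the section name is hypothetical; tests/test_scrub.py below exercises the full option set):

    import StringIO
    from planet import config, feedparser
    from planet.spider import scrub

    # hypothetical per-feed section, mirroring the pattern in tests/test_scrub.py
    config.parser.readfp(StringIO.StringIO(
        '[http://example.com/feed.xml]\ntitle_type = html\n'))

    data = feedparser.parse('<feed xmlns="http://www.w3.org/2005/Atom">'
                            '<entry><title>plain title</title></entry></feed>')
    scrub('http://example.com/feed.xml', data)

    # 'html' was expanded through type_map to a full MIME type
    print(data.entries[0].title_detail.type)   # text/html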
tests/data/config/basic.ini
@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo
 
 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one
 
 [feed2]
 name = two
+filters = bar
tests/data/filter/stripAd-yahoo.ini (new file, 2 lines)
@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed
tests/data/filter/stripAd-yahoo.xml (new file, 4 lines)
@@ -0,0 +1,4 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+<content><div xmlns="http://www.w3.org/1999/xhtml">before-<p><!-- begin(Yahoo ad) -->ad content here<!-- end(Yahoo ad) --></p>-after</div></content>
+</entry>
+
tests/test_config.py
@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))
tests/test_filters.py
@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
             u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
             u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo
tests/test_scrub.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>F&ouml;o</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>F&ouml;o</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>F&ouml;o</title>
+    <summary>F&ouml;o</summary>
+    <content>F&ouml;o</content>
+    <source>
+      <author><name>F&ouml;o</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('Föo', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
+
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)
+