Per feed filters and scrubbing
This commit is contained in:
parent 88bdbe930e
commit 9d85882503
filters/stripAd/feedburner.sed (new file, 1 line)
@@ -0,0 +1 @@
+s|<p><a href="http://[a-zA-Z0-9\-\.]*/~a/[a-zA-Z0-9]*?a=[a-zA-Z0-9]*"><img border="0" src="http://[a-zA-Z0-9\.\-]*/~a/[a-zA-Z0-9/]*?i=[a-zA-Z0-9]*"/></a></p>||g
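
This pattern targets the ad paragraph FeedBurner injects into entries: a click-through link under a /~a/ path wrapping a borderless tracking image. A hypothetical sample of markup it removes (hostname and tokens invented for illustration):

    <p><a href="http://feeds.example.com/~a/XyZ123?a=Ad456"><img border="0" src="http://feeds.example.com/~a/XyZ123?i=Ad456"/></a></p>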
filters/stripAd/yahoo.sed (new file, 1 line)
@@ -0,0 +1 @@
+s|<p><!-- begin(Yahoo ad) -->.*<!-- end(Yahoo ad) --></p>||
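
The stripAd-yahoo.xml test data added later in this commit exercises exactly this expression; applied to that sample content, the ad paragraph collapses away:

    before: before-<p><!-- begin(Yahoo ad) -->ad content here<!-- end(Yahoo ad) --></p>-after
    after:  before--after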
planet/config.py
@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')
 
     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')
 
 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
             result.append(section)
     return result
 
+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),
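
The new config.filters() concatenates the planet-wide filter list from [Planet] with any per-section additions, so per-feed filters run after the global ones. A minimal sketch of the resulting semantics, assuming the test configuration shown further down lives at tests/data/config/basic.ini:

    from planet import config

    config.load('tests/data/config/basic.ini')
    config.filters()         # ['foo']
    config.filters('feed2')  # ['foo', 'bar']
    config.filters('feed1')  # ['foo']; feed1 defines no filters of its own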
planet/shell/sed.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout
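
A sketch of invoking this handler directly, with illustrative file names (in normal operation planet.shell routes *.sed filters here):

    from planet.shell import sed

    doc = open('tests/data/filter/stripAd-yahoo.xml').read()
    clean = sed.run('filters/stripAd/yahoo.sed', doc)

Note that when output_file is given, sed's output goes to that file and run() returns None, since communicate() only captures piped streams.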
planet/spider.py
@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()
 
+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
 
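scrub() is driven entirely by per-feed configuration; the options it consults look like this in an ini file (block lifted from the new test_scrub.py below, where the section name is the feed URI):

    [testfeed]
    ignore_in_feed = id updated
    name_type = html
    title_type = html
    summary_type = html
    content_type = html

Shorthand type values pass through type_map, so 'html' becomes 'text/html' on each entry's title_detail, summary_detail, and content.
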
@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status
 
+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''<feed xmlns:planet="%s"
@@ -147,7 +205,6 @@ def spiderFeed(feed):
     # write each entry to the cache
     cache = config.cache_directory()
     for entry in data.entries:
-
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
@@ -158,6 +215,9 @@ def spiderFeed(feed):
 
         # get updated-date either from the entry or the cache (default to now)
         mtime = None
+        if not entry.has_key('updated_parsed'):
+            if entry.has_key('published_parsed'):
+                entry['updated_parsed'] = entry.published_parsed
         if entry.has_key('updated_parsed'):
             mtime = calendar.timegm(entry.updated_parsed)
             if mtime > time.time(): mtime = None
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
             output = shell.run(filter, output, mode="filter")
             if not output: return
 
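shell.run() picks a handler by the filter file's extension, which is how a *.sed filter reaches the new planet/shell/sed.py. A rough sketch of that dispatch, assuming extension-based lookup (helper name hypothetical):

    import os

    def run_filter(script, doc):
        # e.g. 'stripAd/yahoo.sed' -> planet.shell.sed
        ext = os.path.splitext(script)[1].lstrip('.')
        handler = __import__('planet.shell.' + ext, fromlist=[ext])
        return handler.run(script, doc)
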
tests/data/config/basic.ini
@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo
 
 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one
 
 [feed2]
 name = two
+filters = bar
tests/data/filter/stripAd-yahoo.ini (new file, 2 lines)
@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed
tests/data/filter/stripAd-yahoo.xml (new file, 4 lines)
@@ -0,0 +1,4 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+<content><div xmlns="http://www.w3.org/1999/xhtml">before-<p><!-- begin(Yahoo ad) -->ad content here<!-- end(Yahoo ad) --></p>-after</div></content>
+</entry>
+
tests/test_config.py
@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))
tests/test_filters.py
@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
             u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
             u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo
tests/test_scrub.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>F&ouml;o</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>F&ouml;o</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>F&ouml;o</title>
+    <summary>F&ouml;o</summary>
+    <content>F&ouml;o</content>
+    <source>
+      <author><name>F&ouml;o</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('Föo', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
+
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)
+
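The byte-string assertions above encode the expectation that the scrubbed names come back UTF-8 encoded; a quick illustrative check of that arithmetic:

    u'F\xf6o'                  # what feedparser yields for F&ouml;o
    u'F\xf6o'.encode('utf-8')  # 'F\xc3\xb6o', the value the test asserts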