From 9d858825030544ba04b4afe0e69e99093147f282 Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Mon, 4 Sep 2006 07:14:41 -0400
Subject: [PATCH] Per feed filters and scrubbing

---
 filters/stripAd/feedburner.sed      |  1 +
 filters/stripAd/yahoo.sed           |  1 +
 planet/config.py                    | 14 ++++++-
 planet/shell/sed.py                 | 19 +++++++++
 planet/spider.py                    | 64 ++++++++++++++++++++++++++++-
 tests/data/config/basic.ini         |  2 +
 tests/data/filter/stripAd-yahoo.ini |  2 +
 tests/data/filter/stripAd-yahoo.xml |  4 ++
 tests/test_config.py                |  4 ++
 tests/test_filters.py               | 22 ++++++++++
 tests/test_scrub.py                 | 57 +++++++++++++++++++++++++
 11 files changed, 187 insertions(+), 3 deletions(-)
 create mode 100644 filters/stripAd/feedburner.sed
 create mode 100644 filters/stripAd/yahoo.sed
 create mode 100644 planet/shell/sed.py
 create mode 100644 tests/data/filter/stripAd-yahoo.ini
 create mode 100644 tests/data/filter/stripAd-yahoo.xml
 create mode 100644 tests/test_scrub.py

diff --git a/filters/stripAd/feedburner.sed b/filters/stripAd/feedburner.sed
new file mode 100644
index 0000000..d203ccd
--- /dev/null
+++ b/filters/stripAd/feedburner.sed
@@ -0,0 +1 @@
+s|…||g
diff --git a/filters/stripAd/yahoo.sed b/filters/stripAd/yahoo.sed
new file mode 100644
index 0000000..03cd9dd
--- /dev/null
+++ b/filters/stripAd/yahoo.sed
@@ -0,0 +1 @@
+s|….*…||
diff --git a/planet/config.py b/planet/config.py
index 7438598..a63f5ad 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')

     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')

 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
             result.append(section)
     return result

+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),
diff --git a/planet/shell/sed.py b/planet/shell/sed.py
new file mode 100644
index 0000000..06082a7
--- /dev/null
+++ b/planet/shell/sed.py
@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout
diff --git a/planet/spider.py b/planet/spider.py
index 7ee0054..97658c9 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()

+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """

@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status

+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''…
… time.time(): mtime = None …
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
             output = shell.run(filter, output, mode="filter")
             if not output: return

diff --git a/tests/data/config/basic.ini b/tests/data/config/basic.ini
index 022f3ba..1ccaf42 100644
--- a/tests/data/config/basic.ini
+++ b/tests/data/config/basic.ini
@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo

 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one

 [feed2]
 name = two
+filters = bar
diff --git a/tests/data/filter/stripAd-yahoo.ini b/tests/data/filter/stripAd-yahoo.ini
new file mode 100644
index 0000000..c348c73
--- /dev/null
+++ b/tests/data/filter/stripAd-yahoo.ini
@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed
diff --git a/tests/data/filter/stripAd-yahoo.xml b/tests/data/filter/stripAd-yahoo.xml
new file mode 100644
index 0000000..68fb691
--- /dev/null
+++ b/tests/data/filter/stripAd-yahoo.xml
@@ -0,0 +1,4 @@
+…
+…before-…ad content here…-after…
+…
+…
diff --git a/tests/test_config.py b/tests/test_config.py
index 14c0239..5b8f1e1 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 7f7a433..14aafe9 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
           u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
           u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
new file mode 100644
index 0000000..dc94e05
--- /dev/null
+++ b/tests/test_scrub.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>Föo</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>Föo</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>Föo</title>
+    <summary>Föo</summary>
+    <content>Föo</content>
+    <source>
+      <author><name>Föo</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('Föo', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
+
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)
+
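For illustration only, a minimal planet configuration exercising the options introduced
by this patch might look like the sketch below; the feed section name and the option
values shown are hypothetical examples, not part of the patch itself:

    [Planet]
    # filters listed here run against every feed
    filters = stripAd/feedburner.sed

    [http://example.com/feed.xml]
    # per-feed filters are appended to the planet-wide list by config.filters(section)
    filters = stripAd/yahoo.sed
    # options read by planet.spider.scrub() when this feed is fetched
    ignore_in_feed = updated
    title_type = html
    name_type = html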