From 9d858825030544ba04b4afe0e69e99093147f282 Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Mon, 4 Sep 2006 07:14:41 -0400
Subject: [PATCH] Per feed filters and scrubbing

---
 filters/stripAd/feedburner.sed      |  1 +
 filters/stripAd/yahoo.sed           |  1 +
 planet/config.py                    | 14 ++++++-
 planet/shell/sed.py                 | 19 +++++++++
 planet/spider.py                    | 64 ++++++++++++++++++++++++++++-
 tests/data/config/basic.ini         |  2 +
 tests/data/filter/stripAd-yahoo.ini |  2 +
 tests/data/filter/stripAd-yahoo.xml |  4 ++
 tests/test_config.py                |  4 ++
 tests/test_filters.py               | 22 ++++++++++
 tests/test_scrub.py                 | 57 +++++++++++++++++++++++++
 11 files changed, 187 insertions(+), 3 deletions(-)
 create mode 100644 filters/stripAd/feedburner.sed
 create mode 100644 filters/stripAd/yahoo.sed
 create mode 100644 planet/shell/sed.py
 create mode 100644 tests/data/filter/stripAd-yahoo.ini
 create mode 100644 tests/data/filter/stripAd-yahoo.xml
 create mode 100644 tests/test_scrub.py

diff --git a/filters/stripAd/feedburner.sed b/filters/stripAd/feedburner.sed
new file mode 100644
index 0000000..d203ccd
--- /dev/null
+++ b/filters/stripAd/feedburner.sed
@@ -0,0 +1 @@
+s|…||g
diff --git a/filters/stripAd/yahoo.sed b/filters/stripAd/yahoo.sed
new file mode 100644
index 0000000..03cd9dd
--- /dev/null
+++ b/filters/stripAd/yahoo.sed
@@ -0,0 +1 @@
+s|….*…||
diff --git a/planet/config.py b/planet/config.py
index 7438598..a63f5ad 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')

     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')

 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
             result.append(section)
     return result

+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),
diff --git a/planet/shell/sed.py b/planet/shell/sed.py
new file mode 100644
index 0000000..06082a7
--- /dev/null
+++ b/planet/shell/sed.py
@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout
diff --git a/planet/spider.py b/planet/spider.py
index 7ee0054..97658c9 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()

+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """

@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status

+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''…
… time.time(): mtime = None …
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
             output = shell.run(filter, output, mode="filter")
             if not output: return

diff --git a/tests/data/config/basic.ini b/tests/data/config/basic.ini
index 022f3ba..1ccaf42 100644
--- a/tests/data/config/basic.ini
+++ b/tests/data/config/basic.ini
@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo

 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one

 [feed2]
 name = two
+filters = bar
diff --git a/tests/data/filter/stripAd-yahoo.ini b/tests/data/filter/stripAd-yahoo.ini
new file mode 100644
index 0000000..c348c73
--- /dev/null
+++ b/tests/data/filter/stripAd-yahoo.ini
@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed
diff --git a/tests/data/filter/stripAd-yahoo.xml b/tests/data/filter/stripAd-yahoo.xml
new file mode 100644
index 0000000..68fb691
--- /dev/null
+++ b/tests/data/filter/stripAd-yahoo.xml
@@ -0,0 +1,4 @@
+…
+…before-…ad content here…-after…
+…
+…
diff --git a/tests/test_config.py b/tests/test_config.py
index 14c0239..5b8f1e1 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 7f7a433..14aafe9 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
           u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
           u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
new file mode 100644
index 0000000..dc94e05
--- /dev/null
+++ b/tests/test_scrub.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>Föo</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>Föo</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>Föo</title>
+    <summary>Föo</summary>
+    <content>Föo</content>
+    <source>
+      <author><name>Föo</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('Föo', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
+
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)
+
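For illustration only, a minimal planet configuration exercising the options introduced
by this patch might look like the sketch below; the feed section name and the option
values shown are hypothetical examples, not part of the patch itself:

    [Planet]
    # filters listed here run against every feed
    filters = stripAd/feedburner.sed

    [http://example.com/feed.xml]
    # per-feed filters are appended to the planet-wide list by config.filters(section)
    filters = stripAd/yahoo.sed
    # options read by planet.spider.scrub() when this feed is fetched
    ignore_in_feed = updated
    title_type = html
    name_type = html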