Per-feed filters and scrubbing
commit 9d85882503 (parent 88bdbe930e)

filters/stripAd/feedburner.sed (new file, 1 line)
@@ -0,0 +1 @@
+s|<p><a href="http://[a-zA-Z0-9\-\.]*/~a/[a-zA-Z0-9]*?a=[a-zA-Z0-9]*"><img border="0" src="http://[a-zA-Z0-9\.\-]*/~a/[a-zA-Z0-9/]*?i=[a-zA-Z0-9]*"/></a></p>||g
filters/stripAd/yahoo.sed (new file, 1 line)
@@ -0,0 +1 @@
+s|<p><!-- begin(Yahoo ad) -->.*<!-- end(Yahoo ad) --></p>||
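
Each stripAd filter is a single sed substitution that deletes the matching ad markup outright. For illustration, this is roughly what the yahoo rule does to the markup used by the test data later in this commit (a sketch, assuming sed is on the PATH and the repository root is the working directory):

    from subprocess import Popen, PIPE

    doc = ('<div>before-<p><!-- begin(Yahoo ad) -->ad content here'
           '<!-- end(Yahoo ad) --></p>-after</div>')
    proc = Popen(['sed', '-f', 'filters/stripAd/yahoo.sed'],
                 stdin=PIPE, stdout=PIPE)
    out, _ = proc.communicate(doc)
    print(out)   # <div>before--after</div>

Note that sed applies the expression line by line, so the begin/end ad comments must appear on one line for the `.*` to span them.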
planet/config.py
@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')
 
     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')
 
 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
                 result.append(section)
     return result
 
+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),
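
The new filters() accessor appends a feed section's own filters after the planet-wide ones, so global filters always run first. A minimal sketch of the merge (the feed URL is hypothetical; the test_filters case added to tests/test_config.py below exercises the same behavior):

    import StringIO
    from planet import config

    ini = ('[Planet]\n'
           'filters = stripAd/feedburner.sed\n'
           '\n'
           '[http://example.com/feed.xml]\n'   # hypothetical feed section
           'filters = stripAd/yahoo.sed\n')
    config.parser.readfp(StringIO.StringIO(ini))

    # planet-wide filters first, then the feed's own additions
    print(config.filters('http://example.com/feed.xml'))
    # ['stripAd/feedburner.sed', 'stripAd/yahoo.sed']
    print(config.filters())
    # ['stripAd/feedburner.sed']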
planet/shell/sed.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout
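
Two behaviors of run() worth noting: with no output_file, stdout is a pipe and the filtered document comes back as a string; with output_file, stdout is redirected to that file and the return value is None. Any stderr from sed is logged either way. A usage sketch (assuming sed is installed and the project root is the working directory; the output path is illustrative):

    from planet.shell import sed

    doc = open('tests/data/filter/stripAd-yahoo.xml').read()

    # in memory: returns the filtered document as a string
    filtered = sed.run('filters/stripAd/yahoo.sed', doc)

    # to a file (hypothetical path): stdout is redirected, so run() returns None
    sed.run('filters/stripAd/yahoo.sed', doc, output_file='filtered.xml')

In the pipeline this module is not called directly; spiderFeed goes through shell.run(filter, output, mode="filter"), which hands .sed scripts to this handler.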
planet/spider.py
@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()
 
+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+           data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+               entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                   source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
 
@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status
 
+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''<feed xmlns:planet="%s"
@@ -147,7 +205,6 @@ def spiderFeed(feed):
     # write each entry to the cache
     cache = config.cache_directory()
     for entry in data.entries:
-
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
@@ -158,6 +215,9 @@ def spiderFeed(feed):
 
         # get updated-date either from the entry or the cache (default to now)
         mtime = None
+        if not entry.has_key('updated_parsed'):
+            if entry.has_key('published_parsed'):
+                entry['updated_parsed'] = entry.published_parsed
         if entry.has_key('updated_parsed'):
             mtime = calendar.timegm(entry.updated_parsed)
         if mtime > time.time(): mtime = None
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
             output = shell.run(filter, output, mode="filter")
             if not output: return
 
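
scrub() is driven entirely by per-feed configuration: ignore_in_feed deletes untrusted entry keys (plus their _detail and _parsed variants), the *_type options overwrite declared types (short names like 'html' expand through type_map), and an html name_type strips markup from author names. A small sketch of the type override on a parsed feed (the section name is hypothetical; tests/test_scrub.py below exercises the full option set):

    import StringIO
    from planet import config, feedparser
    from planet.spider import scrub

    # hypothetical per-feed section, mirroring the pattern in tests/test_scrub.py
    config.parser.readfp(StringIO.StringIO(
        '[http://example.com/feed.xml]\ntitle_type = html\n'))

    data = feedparser.parse('<feed xmlns="http://www.w3.org/2005/Atom">'
                            '<entry><title>plain title</title></entry></feed>')
    scrub('http://example.com/feed.xml', data)

    # 'html' was expanded through type_map to a full MIME type
    print(data.entries[0].title_detail.type)   # text/html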
tests/data/config/basic.ini
@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo
 
 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one
 
 [feed2]
 name = two
+filters = bar
tests/data/filter/stripAd-yahoo.ini (new file, 2 lines)
@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed
tests/data/filter/stripAd-yahoo.xml (new file, 4 lines)
@@ -0,0 +1,4 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+<content><div xmlns="http://www.w3.org/1999/xhtml">before-<p><!-- begin(Yahoo ad) -->ad content here<!-- end(Yahoo ad) --></p>-after</div></content>
+</entry>
+
tests/test_config.py
@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))
tests/test_filters.py
@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
             u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
             u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo
tests/test_scrub.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>F&ouml;o</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>F&ouml;o</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>F&ouml;o</title>
+    <summary>F&ouml;o</summary>
+    <content>F&ouml;o</content>
+    <source>
+      <author><name>F&ouml;o</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('Föo', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
+
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)
+