Per feed filters and scrubbing

Sam Ruby 2006-09-04 07:14:41 -04:00
parent 88bdbe930e
commit 9d85882503
11 changed files with 187 additions and 3 deletions

View File

@@ -0,0 +1 @@
+s|<p><a href="http://[a-zA-Z0-9\-\.]*/~a/[a-zA-Z0-9]*?a=[a-zA-Z0-9]*"><img border="0" src="http://[a-zA-Z0-9\.\-]*/~a/[a-zA-Z0-9/]*?i=[a-zA-Z0-9]*"/></a></p>||g

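This first script appears to target FeedBurner-style ad markup: a paragraph wrapping a tracking link and image. A quick sketch of pushing such a document through the new sed runner introduced below; the script path and sample URLs are illustrative, since this view does not show the file's name:

from planet.shell import sed

# Hypothetical ad block of the shape the expression above matches.
doc = ('<p><a href="http://feeds.example.com/~a/oEb3?a=Fq2sQR">'
       '<img border="0" src="http://feeds.example.com/~a/img/oEb3?i=Fq2sQR"/>'
       '</a></p><div>post body</div>')

# The script path is an assumption; the runner pipes doc through sed.
print sed.run('filters/stripAd/feedburner.sed', doc)
# -> '<div>post body</div>'
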
stripAd/yahoo.sed Normal file
View File

@@ -0,0 +1 @@
+s|<p><!-- begin(Yahoo ad) -->.*<!-- end(Yahoo ad) --></p>||

planet/config.py
View File

@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')

     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')

 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
             result.append(section)
     return result

+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),

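The new config.filters() helper concatenates the Planet-wide filter list with any per-section list, so a feed receives the global filters plus its own. A minimal sketch using an inline config, the same trick the new test_scrub.py below uses; the section and filter names here are illustrative:

import StringIO
from planet import config

config.parser.readfp(StringIO.StringIO("""
[Planet]
filters = stripAd/yahoo.sed

[feed-with-extras]
filters = extra.sed
"""))

print config.filters()                    # ['stripAd/yahoo.sed']
print config.filters('feed-with-extras')  # ['stripAd/yahoo.sed', 'extra.sed']
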
planet/shell/sed.py Normal file (19 lines)
View File

@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout

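The runner pipes the document through sed on stdin/stdout and logs anything sed writes to stderr. A small usage sketch; the file paths are illustrative. Note that when output_file is given, stdout is redirected to that file, so run() returns None:

from planet.shell import sed

doc = open('cache/example-entry.xml').read()   # illustrative input

# In-memory: the filtered text comes back as the return value.
filtered = sed.run('filters/stripAd/yahoo.sed', doc)

# To a file: stdout goes to the file, so the return value is None.
sed.run('filters/stripAd/yahoo.sed', doc, output_file='filtered.xml')
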
planet/spider.py
View File

@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()

+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status

+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''<feed xmlns:planet="%s"
@@ -147,7 +205,6 @@ def spiderFeed(feed):
     # write each entry to the cache
     cache = config.cache_directory()
     for entry in data.entries:
-
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
@@ -158,6 +215,9 @@ def spiderFeed(feed):
         # get updated-date either from the entry or the cache (default to now)
         mtime = None
+        if not entry.has_key('updated_parsed'):
+            if entry.has_key('published_parsed'):
+                entry['updated_parsed'] = entry.published_parsed
         if entry.has_key('updated_parsed'):
             mtime = calendar.timegm(entry.updated_parsed)
             if mtime > time.time(): mtime = None
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
            output = shell.run(filter, output, mode="filter")
            if not output: return

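Putting the pieces together, scrub() lets a subscription declare a feed's untrustworthy or mistyped fields in its config section and have them dropped or re-typed at spider time. A minimal sketch mirroring the new test_scrub.py below; the section name would normally be the feed's URI:

import StringIO
from planet import config, feedparser
from planet.spider import scrub

config.parser.readfp(StringIO.StringIO("""
[testfeed]
ignore_in_feed = updated
title_type = html
"""))

data = feedparser.parse(
    '<feed xmlns="http://www.w3.org/2005/Atom"><entry>'
    '<title>Foo</title><updated>2000-01-01T00:00:00Z</updated>'
    '</entry></feed>')

scrub('testfeed', data)
# The entry's updated/updated_parsed values are now gone, and
# title_detail.type is 'text/html' (shorthand normalized via type_map).
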
View File

@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo

 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one
 [feed2]
 name = two
+filters = bar

tests/data/filter/stripAd-yahoo.ini Normal file
View File

@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed

tests/data/filter/stripAd-yahoo.xml Normal file
View File

@@ -0,0 +1,4 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+<content><div xmlns="http://www.w3.org/1999/xhtml">before-<p><!-- begin(Yahoo ad) -->ad content here<!-- end(Yahoo ad) --></p>-after</div></content>
+</entry>

tests/test_config.py
View File

@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))

tests/test_filters.py
View File

@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
             u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
             u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed = Popen(['sed', '--version'], stdout=PIPE, stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo

tests/test_scrub.py Normal file (57 lines)
View File

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>F&amp;ouml;o</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>F&amp;ouml;o</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>F&amp;ouml;o</title>
+    <summary>F&amp;ouml;o</summary>
+    <content>F&amp;ouml;o</content>
+    <source>
+      <author><name>F&amp;ouml;o</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('F&ouml;o', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o',
+            data.entries[0].source.author_detail.name)
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)