diff --git a/filters/stripAd/feedburner.sed b/filters/stripAd/feedburner.sed new file mode 100644 index 0000000..d203ccd --- /dev/null +++ b/filters/stripAd/feedburner.sed @@ -0,0 +1 @@ +s|
||g diff --git a/filters/stripAd/yahoo.sed b/filters/stripAd/yahoo.sed new file mode 100644 index 0000000..03cd9dd --- /dev/null +++ b/filters/stripAd/yahoo.sed @@ -0,0 +1 @@ +s|.*
|| diff --git a/planet/config.py b/planet/config.py index 7438598..a63f5ad 100644 --- a/planet/config.py +++ b/planet/config.py @@ -102,7 +102,6 @@ def __init__(): define_planet_list('template_files') define_planet_list('bill_of_materials') define_planet_list('template_directories') - define_planet_list('filters') define_planet_list('filter_directories') # template options @@ -111,6 +110,11 @@ def __init__(): define_tmpl_int('activity_threshold', 0) define_tmpl('encoding', 'utf-8') define_tmpl('content_type', 'utf-8') + define_tmpl('ignore_in_feed', '') + define_tmpl('name_type', '') + define_tmpl('title_type', '') + define_tmpl('summary_type', '') + define_tmpl('content_type', '') def load(config_file): """ initialize and load a configuration""" @@ -271,6 +275,14 @@ def reading_lists(): result.append(section) return result +def filters(section=None): + filters = [] + if parser.has_option('Planet', 'filters'): + filters += parser.get('Planet', 'filters').split() + if section and parser.has_option(section, 'filters'): + filters += parser.get(section, 'filters').split() + return filters + def planet_options(): """ dictionary of planet wide options""" return dict(map(lambda opt: (opt, parser.get('Planet',opt)), diff --git a/planet/shell/sed.py b/planet/shell/sed.py new file mode 100644 index 0000000..06082a7 --- /dev/null +++ b/planet/shell/sed.py @@ -0,0 +1,19 @@ +from subprocess import Popen, PIPE + +def run(script, doc, output_file=None, options={}): + """ process a sed script """ + + if output_file: + out = open(output_file, 'w') + else: + out = PIPE + + proc = Popen(['sed', '-f', script], + stdin=PIPE, stdout=out, stderr=PIPE) + + stdout, stderr = proc.communicate(doc) + if stderr: + import planet + planet.logger.error(stderr) + + return stdout diff --git a/planet/spider.py b/planet/spider.py index 7ee0054..97658c9 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -42,6 +42,61 @@ def write(xdoc, out): file.write(xdoc) file.close() +type_map = {'text': 
'text/plain', 'html': 'text/html', + 'xhtml': 'application/xhtml+xml'} + +def scrub(feed, data): + + # some data is not trustworthy + for tag in config.ignore_in_feed(feed).split(): + for entry in data.entries: + if entry.has_key(tag): del entry[tag] + if entry.has_key(tag + "_detail"): del entry[tag + "_detail"] + if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"] + + # adjust title types + if config.title_type(feed): + title_type = config.title_type(feed) + title_type = type_map.get(title_type, title_type) + for entry in data.entries: + if entry.has_key('title_detail'): + entry.title_detail['type'] = title_type + + # adjust summary types + if config.summary_type(feed): + summary_type = config.summary_type(feed) + summary_type = type_map.get(summary_type, summary_type) + for entry in data.entries: + if entry.has_key('summary_detail'): + entry.summary_detail['type'] = summary_type + + # adjust content types + if config.content_type(feed): + content_type = config.content_type(feed) + content_type = type_map.get(content_type, content_type) + for entry in data.entries: + if entry.has_key('content'): + entry.content[0]['type'] = content_type + + # some people put html in author names + if config.name_type(feed).find('html')>=0: + from planet.shell.tmpl import stripHtml + if data.feed.has_key('author_detail') and \ + data.feed.author_detail.has_key('name'): + data.feed.author_detail['name'] = \ + str(stripHtml(data.feed.author_detail.name)) + for entry in data.entries: + if entry.has_key('author_detail') and \ + entry.author_detail.has_key('name'): + entry.author_detail['name'] = \ + str(stripHtml(entry.author_detail.name)) + if entry.has_key('source'): + source = entry.source + if source.has_key('author_detail') and \ + source.author_detail.has_key('name'): + source.author_detail['name'] = \ + str(stripHtml(source.author_detail.name)) + def spiderFeed(feed): """ Spider (fetch) a single feed """ @@ -136,6 +191,9 @@ def spiderFeed(feed): elif data.status >= 
400: data.feed['planet_message'] = "http status %s" % status + # perform user configured scrub operations on the data + scrub(feed, data) + # write the feed info to the cache if not os.path.exists(sources): os.makedirs(sources) xdoc=minidom.parseString('''ad content here
-after