From 9e80c7e77f11462128ac778ef4f25f0c373220f9 Mon Sep 17 00:00:00 2001 From: Sam Ruby Date: Wed, 30 Aug 2006 23:07:27 -0400 Subject: [PATCH] Initial filter support (no parameters) --- filters/coral_cdn_filter.py | 17 ++++++++++++++++ planet/config.py | 12 +++++++++-- planet/shell/__init__.py | 35 ++++++++++++++++++++++----------- planet/shell/py.py | 16 +++++++++++++++ planet/spider.py | 28 ++++++++++++++++++-------- tests/data/filter/coral_cdn.xml | 7 +++++++ tests/test_filters.py | 15 ++++++++++++++ 7 files changed, 108 insertions(+), 22 deletions(-) create mode 100644 filters/coral_cdn_filter.py create mode 100644 planet/shell/py.py create mode 100644 tests/data/filter/coral_cdn.xml create mode 100644 tests/test_filters.py diff --git a/filters/coral_cdn_filter.py b/filters/coral_cdn_filter.py new file mode 100644 index 0000000..0192c63 --- /dev/null +++ b/filters/coral_cdn_filter.py @@ -0,0 +1,17 @@ +""" +Remap all images to take advantage of the Coral Content Distribution +Network . +""" + +import sys, urlparse, xml.dom.minidom + +entry = xml.dom.minidom.parse(sys.stdin).documentElement + +for node in entry.getElementsByTagName('img'): + if node.hasAttribute('src'): + component = list(urlparse.urlparse(node.getAttribute('src'))) + if component[0]=='http' and component[1].find(':')<0: + component[1] += '.nyud.net:8080' + node.setAttribute('src', urlparse.urlunparse(component)) + +print entry.toxml('utf-8') diff --git a/planet/config.py b/planet/config.py index 9a56353..5b3b187 100644 --- a/planet/config.py +++ b/planet/config.py @@ -101,6 +101,8 @@ def __init__(): define_planet_list('template_files') define_planet_list('bill_of_materials') define_planet_list('template_directories') + define_planet_list('filters') + define_planet_list('filter_directories') define_planet_list('reading_lists') # template options @@ -151,6 +153,12 @@ def load(config_file): else: log.error('Unable to find theme %s', theme) + # Filter support + dirs = config.filter_directories() + filter_dir = os.path.join(sys.path[0],'filters') + if filter_dir not in dirs and os.path.exists(filter_dir): + parser.set('Planet', 'filter_directories', ' '.join(dirs+[filter_dir])) + # Reading list support reading_lists = config.reading_lists() if reading_lists: @@ -209,8 +217,8 @@ def feedtype(): def subscriptions(): """ list the feed subscriptions """ - return filter(lambda feed: feed!='Planet' and feed not in template_files(), - parser.sections()) + return filter(lambda feed: feed!='Planet' and + feed not in template_files()+filters(), parser.sections()) def planet_options(): """ dictionary of planet wide options""" diff --git a/planet/shell/__init__.py b/planet/shell/__init__.py index d1fe258..4d83661 100644 --- a/planet/shell/__init__.py +++ b/planet/shell/__init__.py @@ -2,16 +2,21 @@ import planet import os import sys -def run(template_file, doc): +def run(template_file, doc, mode='template'): """ select a template module based on file extension and execute it """ log = planet.getLogger(planet.config.log_level()) + if mode == 'template': + dirs = planet.config.template_directories() + else: + dirs = planet.config.filter_directories() + # see if the template can be located - for template_dir in planet.config.template_directories(): + for template_dir in dirs: template_resolved = os.path.join(template_dir, template_file) if os.path.exists(template_resolved): break else: - return log.error("Unable to locate template %s", template_file) + return log.error("Unable to locate %s %s", mode, template_file) # Add shell directory to the path, if not already there shellpath = os.path.join(sys.path[0],'planet','shell') @@ -20,16 +25,22 @@ def run(template_file, doc): # Try loading module for processing this template, based on the extension base,ext = os.path.splitext(os.path.basename(template_resolved)) - template_module_name = ext[1:] + module_name = ext[1:] try: - template_module = __import__(template_module_name) + module = __import__(module_name) except Exception, inst: - return log.error("Skipping template '%s' after failing to load '%s':" + - " %s", template_resolved, template_module_name, inst) + print module_name + return log.error("Skipping %s '%s' after failing to load '%s': %s", + mode, template_resolved, module_name, inst) # Execute the shell module - log.info("Processing template %s using %s", template_resolved, - template_module_name) - output_dir = planet.config.output_dir() - output_file = os.path.join(output_dir, base) - template_module.run(template_resolved, doc, output_file) + if mode == 'filter': + log.debug("Processing filer %s using %s", template_resolved, + module_name) + return module.run(template_resolved, doc, None) + else: + log.info("Processing template %s using %s", template_resolved, + module_name) + output_dir = planet.config.output_dir() + output_file = os.path.join(output_dir, base) + module.run(template_resolved, doc, output_file) diff --git a/planet/shell/py.py b/planet/shell/py.py new file mode 100644 index 0000000..cb233fb --- /dev/null +++ b/planet/shell/py.py @@ -0,0 +1,16 @@ +from subprocess import Popen, PIPE + +def run(script, doc, output_file=None): + """ process an Python script """ + + if output_file: + out = open(output_file, 'w') + else: + out = PIPE + + proc = Popen(['python', script], stdin=PIPE, stdout=out, stderr=PIPE) + stdout, stderr = proc.communicate(doc) + if stderr: + print stderr + + return stdout diff --git a/planet/spider.py b/planet/spider.py index e39b241..ec05e27 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory. import time, calendar, re, os from xml.dom import minidom # Planet modules -import planet, config, feedparser, reconstitute +import planet, config, feedparser, reconstitute, shell # Regular expressions to sanitise cache filenames re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?') @@ -39,9 +39,8 @@ def filename(directory, filename): def write(xdoc, out): """ write the document out to disk """ file = open(out,'w') - file.write(xdoc.toxml('utf-8')) + file.write(xdoc) file.close() - xdoc.unlink() def spiderFeed(feed): """ Spider (fetch) a single feed """ @@ -116,30 +115,43 @@ def spiderFeed(feed): xdoc=minidom.parseString('''\n''' % planet.xmlns) reconstitute.source(xdoc.documentElement, data.feed, data.bozo) - write(xdoc, filename(sources, feed)) + write(xdoc.toxml('utf-8'), filename(sources, feed)) + xdoc.unlink() # write each entry to the cache cache = config.cache_directory() for entry in data.entries: + + # generate an id, if none is present if not entry.has_key('id') or not entry.id: entry['id'] = reconstitute.id(None, entry) if not entry['id']: continue - out = filename(cache, entry.id) + # compute cache file name based on the id + cache_file = filename(cache, entry.id) + # get updated-date either from the entry or the cache (default to now) mtime = None if entry.has_key('updated_parsed'): mtime = calendar.timegm(entry.updated_parsed) if mtime > time.time(): mtime = None if not mtime: try: - mtime = os.stat(out).st_mtime + mtime = os.stat(cache_file).st_mtime except: mtime = time.time() entry['updated_parsed'] = time.gmtime(mtime) - write(reconstitute.reconstitute(data, entry), out) - os.utime(out, (mtime, mtime)) + # apply any filters + xdoc = reconstitute.reconstitute(data, entry) + output = xdoc.toxml('utf-8') + xdoc.unlink() + for filter in config.filters(): + output = shell.run(filter, output, mode="filter") + + # write out and timestamp the results + write(output, cache_file) + os.utime(cache_file, (mtime, mtime)) def spiderPlanet(configFile): """ Spider (fetch) an entire planet """ diff --git a/tests/data/filter/coral_cdn.xml b/tests/data/filter/coral_cdn.xml new file mode 100644 index 0000000..3c45248 --- /dev/null +++ b/tests/data/filter/coral_cdn.xml @@ -0,0 +1,7 @@ + + +
+ +
+
+
diff --git a/tests/test_filters.py b/tests/test_filters.py new file mode 100644 index 0000000..5dc2938 --- /dev/null +++ b/tests/test_filters.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python + +import unittest, xml.dom.minidom +from planet import shell + +testfile = 'tests/data/filter/coral_cdn.xml' +filter = 'coral_cdn_filter.py' + +class FilterTests(unittest.TestCase): + + def test_coral_cdn(self): + output = shell.run(filter, open(testfile).read(), mode="filter") + dom = xml.dom.minidom.parseString(output) + imgsrc = dom.getElementsByTagName('img')[0].getAttribute('src') + self.assertEqual('http://example.com.nyud.net:8080/foo.png', imgsrc)