Initial filter support (no parameters)

2006-08-30 23:07:27 -04:00 · 2006-08-30 23:07:27 -04:00 · 9e80c7e77f
commit 9e80c7e77f
parent 4b23c2f967
7 changed files with 108 additions and 22 deletions
--- a/filters/coral_cdn_filter.py
+++ b/filters/coral_cdn_filter.py
@ -0,0 +1,17 @@
+"""
+Remap all images to take advantage of the Coral Content Distribution
+Network <http://www.coralcdn.org/>.
+"""
+
+import sys, urlparse, xml.dom.minidom
+
+entry = xml.dom.minidom.parse(sys.stdin).documentElement
+# redirect only http URLs whose network location has no ':' (e.g. no port)
+for img in entry.getElementsByTagName('img'):
+    if not img.hasAttribute('src'): continue
+    parts = list(urlparse.urlparse(img.getAttribute('src')))
+    if parts[0] != 'http' or ':' in parts[1]: continue
+    parts[1] += '.nyud.net:8080'
+    img.setAttribute('src', urlparse.urlunparse(parts))
+
+print entry.toxml('utf-8')
--- a/planet/config.py
+++ b/planet/config.py
@ -101,6 +101,8 @@ def __init__():
    define_planet_list('template_files')
    define_planet_list('bill_of_materials')
    define_planet_list('template_directories')
+    define_planet_list('filters')
+    define_planet_list('filter_directories')
    define_planet_list('reading_lists')

    # template options
@ -151,6 +153,12 @@ def load(config_file):
        else:
            log.error('Unable to find theme %s', theme)

+    # Filter support
+    dirs = config.filter_directories()
+    filter_dir = os.path.join(sys.path[0],'filters')
+    if filter_dir not in dirs and os.path.exists(filter_dir):
+        parser.set('Planet', 'filter_directories', ' '.join(dirs+[filter_dir]))
+
    # Reading list support
    reading_lists = config.reading_lists()
    if reading_lists:
@ -209,8 +217,8 @@ def feedtype():

 def subscriptions():
    """ list the feed subscriptions """
-    return filter(lambda feed: feed!='Planet' and feed not in template_files(),
-       parser.sections())
+    skip = ['Planet'] + template_files() + filters()  # non-feed sections
+    return filter(lambda feed: feed not in skip, parser.sections())

 def planet_options():
    """ dictionary of planet wide options"""
--- a/planet/shell/init.py
+++ b/planet/shell/init.py
@ -2,16 +2,21 @@ import planet
 import os
 import sys

-def run(template_file, doc):
+def run(template_file, doc, mode='template'):
    """ select a template module based on file extension and execute it """
    log = planet.getLogger(planet.config.log_level())

+    if mode == 'filter':
+        dirs = planet.config.filter_directories()
+    else:
+        dirs = planet.config.template_directories()
+
    # see if the template can be located
-    for template_dir in planet.config.template_directories():
+    for template_dir in dirs:
        template_resolved = os.path.join(template_dir, template_file)
        if os.path.exists(template_resolved): break
    else:
-        return log.error("Unable to locate template %s", template_file)
+        return log.error("Unable to locate %s %s", mode, template_file)

    # Add shell directory to the path, if not already there
    shellpath = os.path.join(sys.path[0],'planet','shell')
@ -20,16 +25,22 @ def run(template_file, doc):

    # Try loading module for processing this template, based on the extension
    base,ext = os.path.splitext(os.path.basename(template_resolved))
-    template_module_name = ext[1:]
+    module_name = ext[1:]
    try:
-        template_module = __import__(template_module_name)
+        module = __import__(module_name)
    except Exception, inst:
-        return log.error("Skipping template '%s' after failing to load '%s':" +
-            " %s", template_resolved, template_module_name, inst)
+        # no shell module could be loaded for this extension: log and give up
+        return log.error("Skipping %s '%s' after failing to load '%s': %s",
+            mode, template_resolved, module_name, inst)

    # Execute the shell module
-    log.info("Processing template %s using %s", template_resolved,
-        template_module_name)
-    output_dir = planet.config.output_dir()
-    output_file = os.path.join(output_dir, base)
-    template_module.run(template_resolved, doc, output_file)
+    if mode == 'filter':
+        # no output file for filters: the module's result goes to the caller
+        log.debug("Processing filter %s using %s", template_resolved,
+            module_name)
+        return module.run(template_resolved, doc, None)
+
+    log.info("Processing template %s using %s", template_resolved,
+        module_name)
+    output_file = os.path.join(planet.config.output_dir(), base)
+    module.run(template_resolved, doc, output_file)
--- a/planet/shell/py.py
+++ b/planet/shell/py.py
@ -0,0 +1,16 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None):
+    """ run a Python script with doc on its stdin; return its stdout, or
+    None when output_file is given (stdout then goes to that file) """
+    out = PIPE
+    if output_file:
+        out = open(output_file, 'w')
+    try:
+        proc = Popen(['python', script], stdin=PIPE, stdout=out, stderr=PIPE)
+        stdout, stderr = proc.communicate(doc)
+    finally:
+        # release the output file handle, even if the script could not be run
+        if output_file: out.close()
+    if stderr: print stderr
+    return stdout
--- a/planet/spider.py
+++ b/planet/spider.py
@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory.
 import time, calendar, re, os
 from xml.dom import minidom
 # Planet modules
-import planet, config, feedparser, reconstitute
+import planet, config, feedparser, reconstitute, shell

 # Regular expressions to sanitise cache filenames
 re_url_scheme    = re.compile(r'^\w+:/*(\w+:|www\.)?')
@ -39,9 +39,8 @@ def filename(directory, filename):
 def write(xdoc, out):
    """ write the document out to disk """
    file = open(out,'w')
-    file.write(xdoc.toxml('utf-8'))
+    file.write(xdoc)  # xdoc is now serialized text; callers unlink the DOM
    file.close()
-    xdoc.unlink()

 def spiderFeed(feed):
    """ Spider (fetch) a single feed """
@ -116,30 +115,43 @@ def spiderFeed(feed):
    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
-    write(xdoc, filename(sources, feed))
+    write(xdoc.toxml('utf-8'), filename(sources, feed))
+    xdoc.unlink()

    # write each entry to the cache
    cache = config.cache_directory()
    for entry in data.entries:
+
+        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
            if not entry['id']: continue

-        out = filename(cache, entry.id)
+        # compute cache file name based on the id
+        cache_file = filename(cache, entry.id)

+        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if entry.has_key('updated_parsed'):
            mtime = calendar.timegm(entry.updated_parsed)
            if mtime > time.time(): mtime = None
        if not mtime:
            try:
-                mtime = os.stat(out).st_mtime
+                mtime = os.stat(cache_file).st_mtime
            except:
                mtime = time.time()
            entry['updated_parsed'] = time.gmtime(mtime)

-        write(reconstitute.reconstitute(data, entry), out) 
-        os.utime(out, (mtime, mtime))
+        # apply any filters
+        xdoc = reconstitute.reconstitute(data, entry)
+        output = xdoc.toxml('utf-8')
+        xdoc.unlink()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        # write out and timestamp the results
+        write(output, cache_file) 
+        os.utime(cache_file, (mtime, mtime))

 def spiderPlanet(configFile):
    """ Spider (fetch) an entire planet """
--- a/tests/data/filter/coral_cdn.xml
+++ b/tests/data/filter/coral_cdn.xml
@ -0,0 +1,7 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+  <content>
+     <div xmlns="http://www.w3.org/1999/xhtml">
+       <img src="http://example.com/foo.png"/>
+     </div>
+  </content>
+</entry>
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import unittest, xml.dom.minidom
+from planet import shell
+
+testfile = 'tests/data/filter/coral_cdn.xml'  # input for the filter
+filter = 'coral_cdn_filter.py'  # NOTE(review): shadows builtin filter()
+
+class FilterTests(unittest.TestCase):
+    """ run filters through planet.shell and check what they produce """
+    def test_coral_cdn(self):
+        filtered = shell.run(filter, open(testfile).read(), mode="filter")
+        parsed = xml.dom.minidom.parseString(filtered)
+        src = parsed.getElementsByTagName('img')[0].getAttribute('src')
+        self.assertEqual('http://example.com.nyud.net:8080/foo.png', src)