regexp sifter

This commit is contained in:
Sam Ruby 2007-02-15 19:09:10 -05:00
parent 81d10e1f5c
commit a0642afec0
6 changed files with 86 additions and 3 deletions

View File

@ -61,8 +61,13 @@ material information.</dd>
can be found</dd>
<dt><ins>bill_of_materials</ins></dt>
<dd>Space-separated list of files to be copied as is directly from the <code>template_directories</code> to the <code>output_dir</code></dd>
<dt>filter</dt>
<dd>Regular expression that must be found in the textual portion of the entry</dd>
<dt>exclude</dt>
<dd>Regular expression that must <b>not</b> be found in the textual portion of the entry</dd>
<dt><ins>filters</ins></dt>
<dd>Space-separated list of filters to apply to each entry</dd>
<dd>Space-separated list of <a href="filters.html">filters</a> to apply to
each entry</dd>
</dl>
<dl class="compact code">

View File

@ -46,6 +46,11 @@ expressions. Again, parameters can be passed as
<a href="../tests/data/filter/xpath-sifter2.ini">URI style</a>.
</p>
<p>The <a href="../filters/regexp_sifter.py">regexp sifter</a> operates just
like the xpath sifter, except it uses
<a href="http://docs.python.org/lib/re-syntax.html">regular expressions</a>
instead of XPath expressions.</p>
<h3>Notes</h3>
<ul>

44
filters/regexp_sifter.py Normal file
View File

@ -0,0 +1,44 @@
import sys, re

# Substitution pipeline that reduces a normalized Atom entry to its
# human-readable text: machine-oriented metadata is removed, the values of
# alt/title/label/term attributes are kept, remaining markup is stripped,
# and common XML entities are decoded.  Patterns are compiled once at
# module load; order matters (entities are decoded only after tags are
# gone, and '&amp;' is decoded last so it cannot create new entities).
SUBSTITUTIONS = [
    (re.compile(r'<id>.*?</id>'), ' '),
    (re.compile(r'<url>.*?</url>'), ' '),
    (re.compile(r'<source>.*?</source>'), ' '),
    (re.compile(r'<updated.*?</updated>'), ' '),
    (re.compile(r'<published.*?</published>'), ' '),
    (re.compile(r'<link .*?>'), ' '),
    # Keep the human-visible value of these attributes in place of the tag.
    (re.compile(r'''<[^>]* alt=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* title=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* label=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* term=['"]([^'"]*)['"].*?>'''), r' \1 '),
    # Drop any remaining tags, then collapse runs of whitespace.
    (re.compile(r'<.*?>'), ' '),
    (re.compile(r'\s+'), ' '),
    (re.compile(r'&gt;'), '>'),
    (re.compile(r'&lt;'), '<'),
    (re.compile(r'&apos;'), "'"),
    (re.compile(r'&quot;'), '"'),
    (re.compile(r'&amp;'), '&'),
]

def extract_text(entry):
    """Return the textual portion of *entry* (a normalized Atom entry).

    Applies every substitution in SUBSTITUTIONS, in order.
    """
    for pattern, replacement in SUBSTITUTIONS:
        entry = pattern.sub(replacement, entry)
    return entry

def check_filters(options, text):
    """Return True if *text* satisfies the sifter options.

    options -- dict possibly containing '--require' and/or '--exclude',
               each a newline-separated list of regular expressions.
    text    -- the textual portion of the entry (see extract_text).

    Every non-empty '--require' expression must match, and no non-empty
    '--exclude' expression may match.  Empty expressions are ignored.
    """
    for regexp in options.get('--require', '').split('\n'):
        if regexp and not re.search(regexp, text):
            return False
    for regexp in options.get('--exclude', '').split('\n'):
        if regexp and re.search(regexp, text):
            return False
    return True

def main():
    """Sift the entry on stdin; echo it if it passes, exit(1) if not."""
    # Options arrive as alternating '--name value' command-line pairs.
    options = dict(zip(sys.argv[1::2], sys.argv[2::2]))
    # Read the entry once; filtering inspects a text-only copy, but the
    # original document is what gets echoed on success.
    doc = sys.stdin.read()
    if not check_filters(options, extract_text(doc)):
        sys.exit(1)
    # If we get this far, the entry is to be included.
    print(doc)

if __name__ == '__main__':
    main()

View File

@ -26,7 +26,7 @@ Todo:
* error handling (example: no planet section)
"""
import os, sys, re
import os, sys, re, urllib
from ConfigParser import ConfigParser
from urlparse import urljoin
@ -126,6 +126,8 @@ def __init__():
define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
define_tmpl('filter', None)
define_tmpl('exclude', None)
def load(config_file):
""" initialize and load a configuration"""
@ -330,7 +332,7 @@ def feedtype():
def subscriptions():
""" list the feed subscriptions """
return filter(lambda feed: feed!='Planet' and
return __builtins__['filter'](lambda feed: feed!='Planet' and
feed not in template_files()+filters()+reading_lists(),
parser.sections())
@ -350,6 +352,12 @@ def filters(section=None):
filters += parser.get('Planet', 'filters').split()
if section and parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
if filter(section):
filters.append('regexp_sifter.py?require=' +
urllib.quote(filter(section)))
if exclude(section):
filters.append('regexp_sifter.py?exclude=' +
urllib.quote(filter(section)))
return filters
def planet_options():

View File

@ -0,0 +1,2 @@
[Planet]
filter=two

View File

@ -89,6 +89,25 @@ class FilterTests(unittest.TestCase):
self.assertNotEqual('', output)
def test_regexp_filter(self):
config.load('tests/data/filter/regexp-sifter.ini')
testfile = 'tests/data/filter/category-one.xml'
output = open(testfile).read()
for filter in config.filters():
output = shell.run(filter, output, mode="filter")
self.assertEqual('', output)
testfile = 'tests/data/filter/category-two.xml'
output = open(testfile).read()
for filter in config.filters():
output = shell.run(filter, output, mode="filter")
self.assertNotEqual('', output)
try:
from subprocess import Popen, PIPE