From a0642afec002b67eef6976c184161a43c1082330 Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Thu, 15 Feb 2007 19:09:10 -0500
Subject: [PATCH] regexp sifter
---
docs/config.html | 7 ++++-
docs/filters.html | 5 ++++
filters/regexp_sifter.py | 44 +++++++++++++++++++++++++++++
planet/config.py | 12 ++++++--
tests/data/filter/regexp-sifter.ini | 2 ++
tests/test_filters.py | 19 +++++++++++++
6 files changed, 86 insertions(+), 3 deletions(-)
create mode 100644 filters/regexp_sifter.py
create mode 100644 tests/data/filter/regexp-sifter.ini
diff --git a/docs/config.html b/docs/config.html
index 9491a29..b1e6550 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -61,8 +61,13 @@ material information.
can be found
bill_of_materials
Space-separated list of files to be copied as is directly from the template_directories
to the output_dir
+filter
+Regular expression that must be found in the textual portion of the entry
+exclude
+Regular expression that must not be found in the textual portion of the entry
filters
-Space-separated list of filters to apply to each entry
+Space-separated list of filters to apply to
+each entry
diff --git a/docs/filters.html b/docs/filters.html
index 2ec73e0..b4a0394 100644
--- a/docs/filters.html
+++ b/docs/filters.html
@@ -46,6 +46,11 @@ expressions. Again, parameters can be passed as
URI style .
+The regexp sifter operates just
+like the xpath sifter, except it uses
+regular expressions
+instead of XPath expressions.
+
Notes
diff --git a/filters/regexp_sifter.py b/filters/regexp_sifter.py
new file mode 100644
index 0000000..a6f7c4f
--- /dev/null
+++ b/filters/regexp_sifter.py
@@ -0,0 +1,44 @@
+import sys, re
+
+# parse options: argv holds alternating "--name value" pairs
+options = dict(zip(sys.argv[1::2],sys.argv[2::2]))
+
+# read entry (doc keeps the original text, data is reduced to plain text)
+doc = data = sys.stdin.read()
+
+# Apply a sequence of patterns which turn a normalized Atom entry into
+# a stream of text, after removal of non-human metadata.
+for pattern,replacement in [  # NOTE(review): confirm the six element names below
+  (re.compile('<id>.*?</id>'),' '),
+  (re.compile('<url>.*?</url>'),' '),
+  (re.compile('<source>.*?</source>'),' '),
+  (re.compile('<updated.*?</updated>'),' '),
+  (re.compile('<published.*?</published>'),' '),
+  (re.compile('<link .*?>'),' '),
+  (re.compile('''<[^>]* alt=['"]([^'"]*)['"].*?>'''),r' \1 '),
+  (re.compile('''<[^>]* title=['"]([^'"]*)['"].*?>'''),r' \1 '),
+  (re.compile('''<[^>]* label=['"]([^'"]*)['"].*?>'''),r' \1 '),
+  (re.compile('''<[^>]* term=['"]([^'"]*)['"].*?>'''),r' \1 '),
+  (re.compile('<.*?>'),' '),
+  (re.compile(r'\s+'),' '),
+  (re.compile('&gt;'),'>'),
+  (re.compile('&lt;'),'<'),
+  (re.compile('&apos;'),"'"),
+  (re.compile('&quot;'),'"'),
+  (re.compile('&amp;'),'&'),
+  (re.compile(r'\s+'),' ')
+]:
+  data=pattern.sub(replacement,data)
+
+# process requirements: exit 1 unless every non-empty --require line matches
+if options.has_key('--require'):
+  for regexp in options['--require'].split('\n'):
+    if regexp and not re.search(regexp,data): sys.exit(1)
+
+# process exclusions: exit 1 as soon as any non-empty --exclude line matches
+if options.has_key('--exclude'):
+  for regexp in options['--exclude'].split('\n'):
+    if regexp and re.search(regexp,data): sys.exit(1)
+
+# if we get this far, the entry is to be included: print the original text
+print doc
diff --git a/planet/config.py b/planet/config.py
index 0d3b605..669dd68 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -26,7 +26,7 @@ Todo:
* error handling (example: no planet section)
"""
-import os, sys, re
+import os, sys, re, urllib
from ConfigParser import ConfigParser
from urlparse import urljoin
@@ -126,6 +126,8 @@ def __init__():
define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
+ define_tmpl('filter', None)
+ define_tmpl('exclude', None)
def load(config_file):
""" initialize and load a configuration"""
@@ -330,7 +332,7 @@ def feedtype():
def subscriptions():
""" list the feed subscriptions """
- return filter(lambda feed: feed!='Planet' and
+ return __builtins__['filter'](lambda feed: feed!='Planet' and
feed not in template_files()+filters()+reading_lists(),
parser.sections())
@@ -350,6 +352,12 @@ def filters(section=None):
filters += parser.get('Planet', 'filters').split()
if section and parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
+    if filter(section):  # must-match regexp
+        filters.append('regexp_sifter.py?require=' +
+            urllib.quote(filter(section)))
+    if exclude(section):  # must-not-match regexp
+        filters.append('regexp_sifter.py?exclude=' +
+            urllib.quote(exclude(section)))
return filters
def planet_options():
diff --git a/tests/data/filter/regexp-sifter.ini b/tests/data/filter/regexp-sifter.ini
new file mode 100644
index 0000000..a093210
--- /dev/null
+++ b/tests/data/filter/regexp-sifter.ini
@@ -0,0 +1,2 @@
+[Planet]
+filter=two
diff --git a/tests/test_filters.py b/tests/test_filters.py
index 55361eb..fc61e47 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -89,6 +89,25 @@ class FilterTests(unittest.TestCase):
self.assertNotEqual('', output)
+    def test_regexp_filter(self):
+        # regexp-sifter.ini configures filter=two under [Planet]
+        config.load('tests/data/filter/regexp-sifter.ini')
+
+        def sift(testfile):
+            # feed the file through each configured filter in turn
+            output = open(testfile).read()
+            for name in config.filters():
+                output = shell.run(name, output, mode="filter")
+            return output
+
+        # category-one.xml must come back empty
+        self.assertEqual('',
+            sift('tests/data/filter/category-one.xml'))
+
+        # category-two.xml must come back non-empty
+        self.assertNotEqual('',
+            sift('tests/data/filter/category-two.xml'))
+
try:
from subprocess import Popen, PIPE