regexp sifter

This commit is contained in:
Sam Ruby 2007-02-15 19:09:10 -05:00
parent 81d10e1f5c
commit a0642afec0
6 changed files with 86 additions and 3 deletions

View File

@ -61,8 +61,13 @@ material information.</dd>
can be found</dd> can be found</dd>
<dt><ins>bill_of_materials</ins></dt> <dt><ins>bill_of_materials</ins></dt>
<dd>Space-separated list of files to be copied as is directly from the <code>template_directories</code> to the <code>output_dir</code></dd> <dd>Space-separated list of files to be copied as is directly from the <code>template_directories</code> to the <code>output_dir</code></dd>
<dt>filter</dt>
<dd>Regular expression that must be found in the textual portion of the entry</dd>
<dt>exclude</dt>
<dd>Regular expression that must <b>not</b> be found in the textual portion of the entry</dd>
<dt><ins>filters</ins></dt> <dt><ins>filters</ins></dt>
<dd>Space-separated list of filters to apply to each entry</dd> <dd>Space-separated list of <a href="filters.html">filters</a> to apply to
each entry</dd>
</dl> </dl>
<dl class="compact code"> <dl class="compact code">

View File

@ -46,6 +46,11 @@ expressions. Again, parameters can be passed as
<a href="../tests/data/filter/xpath-sifter2.ini">URI style</a>. <a href="../tests/data/filter/xpath-sifter2.ini">URI style</a>.
</p> </p>
<p>The <a href="../filters/regexp_sifter.py">regexp sifter</a> operates just
like the xpath sifter, except it uses
<a href="http://docs.python.org/lib/re-syntax.html">regular expressions</a>
instead of XPath expressions.</p>
<h3>Notes</h3> <h3>Notes</h3>
<ul> <ul>

44
filters/regexp_sifter.py Normal file
View File

@ -0,0 +1,44 @@
import sys, re

# Patterns applied in order to turn a normalized Atom entry into a stream
# of plain text: machine-only metadata (<id>, <url>, <source>, timestamps,
# links) is dropped, human-visible attribute text (alt/title/label/term) is
# kept, remaining markup is stripped, and common XML entities are unescaped.
# Compiled once at module level so repeated calls don't recompile.
_STRIP_PATTERNS = [
  (re.compile(r'<id>.*?</id>'), ' '),
  (re.compile(r'<url>.*?</url>'), ' '),
  (re.compile(r'<source>.*?</source>'), ' '),
  (re.compile(r'<updated.*?</updated>'), ' '),
  (re.compile(r'<published.*?</published>'), ' '),
  (re.compile(r'<link .*?>'), ' '),
  (re.compile(r'''<[^>]* alt=['"]([^'"]*)['"].*?>'''), r' \1 '),
  (re.compile(r'''<[^>]* title=['"]([^'"]*)['"].*?>'''), r' \1 '),
  (re.compile(r'''<[^>]* label=['"]([^'"]*)['"].*?>'''), r' \1 '),
  (re.compile(r'''<[^>]* term=['"]([^'"]*)['"].*?>'''), r' \1 '),
  (re.compile(r'<.*?>'), ' '),
  (re.compile(r'\s+'), ' '),
  (re.compile(r'&gt;'), '>'),
  (re.compile(r'&lt;'), '<'),
  (re.compile(r'&apos;'), "'"),
  (re.compile(r'&quot;'), '"'),
  (re.compile(r'&amp;'), '&'),
  (re.compile(r'\s+'), ' ')
]

def to_text(data):
    """Return the human-readable textual portion of an Atom entry string,
    with metadata removed, markup stripped, and entities unescaped."""
    for pattern, replacement in _STRIP_PATTERNS:
        data = pattern.sub(replacement, data)
    return data

def main():
    """Read an entry from stdin; echo it to stdout unless a --require
    regexp fails to match, or an --exclude regexp does match, the entry's
    textual content (in which case exit with status 1)."""
    # parse options: argv is a flat list of alternating names and values
    options = dict(zip(sys.argv[1::2], sys.argv[2::2]))
    # read entry
    doc = sys.stdin.read()
    data = to_text(doc)
    # process requirements: every non-empty pattern must match
    if '--require' in options:
        for regexp in options['--require'].split('\n'):
            if regexp and not re.search(regexp, data): sys.exit(1)
    # process exclusions: no non-empty pattern may match
    if '--exclude' in options:
        for regexp in options['--exclude'].split('\n'):
            if regexp and re.search(regexp, data): sys.exit(1)
    # if we get this far, the feed is to be included
    print(doc)

if __name__ == '__main__':
    main()

View File

@ -26,7 +26,7 @@ Todo:
* error handling (example: no planet section) * error handling (example: no planet section)
""" """
import os, sys, re import os, sys, re, urllib
from ConfigParser import ConfigParser from ConfigParser import ConfigParser
from urlparse import urljoin from urlparse import urljoin
@ -126,6 +126,8 @@ def __init__():
define_tmpl('content_type', '') define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep') define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '') define_tmpl('xml_base', '')
define_tmpl('filter', None)
define_tmpl('exclude', None)
def load(config_file): def load(config_file):
""" initialize and load a configuration""" """ initialize and load a configuration"""
@ -330,7 +332,7 @@ def feedtype():
def subscriptions(): def subscriptions():
""" list the feed subscriptions """ """ list the feed subscriptions """
return filter(lambda feed: feed!='Planet' and return __builtins__['filter'](lambda feed: feed!='Planet' and
feed not in template_files()+filters()+reading_lists(), feed not in template_files()+filters()+reading_lists(),
parser.sections()) parser.sections())
@ -350,6 +352,12 @@ def filters(section=None):
filters += parser.get('Planet', 'filters').split() filters += parser.get('Planet', 'filters').split()
if section and parser.has_option(section, 'filters'): if section and parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split() filters += parser.get(section, 'filters').split()
if filter(section):
filters.append('regexp_sifter.py?require=' +
urllib.quote(filter(section)))
if exclude(section):
filters.append('regexp_sifter.py?exclude=' +
urllib.quote(filter(section)))
return filters return filters
def planet_options(): def planet_options():

View File

@ -0,0 +1,2 @@
[Planet]
filter=two

View File

@ -89,6 +89,25 @@ class FilterTests(unittest.TestCase):
self.assertNotEqual('', output) self.assertNotEqual('', output)
def test_regexp_filter(self):
config.load('tests/data/filter/regexp-sifter.ini')
testfile = 'tests/data/filter/category-one.xml'
output = open(testfile).read()
for filter in config.filters():
output = shell.run(filter, output, mode="filter")
self.assertEqual('', output)
testfile = 'tests/data/filter/category-two.xml'
output = open(testfile).read()
for filter in config.filters():
output = shell.run(filter, output, mode="filter")
self.assertNotEqual('', output)
try: try:
from subprocess import Popen, PIPE from subprocess import Popen, PIPE