regexp sifter

This commit is contained in:
Sam Ruby 2007-02-15 19:09:10 -05:00
parent 81d10e1f5c
commit a0642afec0
6 changed files with 86 additions and 3 deletions

View File

@ -61,8 +61,13 @@ material information.</dd>
can be found</dd>
<dt><ins>bill_of_materials</ins></dt>
<dd>Space-separated list of files to be copied as is directly from the <code>template_directories</code> to the <code>output_dir</code></dd>
<dt>filter</dt>
<dd>Regular expression that must be found in the textual portion of the entry</dd>
<dt>exclude</dt>
<dd>Regular expression that must <b>not</b> be found in the textual portion of the entry</dd>
<dt><ins>filters</ins></dt>
<dd>Space-separated list of filters to apply to each entry</dd>
<dd>Space-separated list of <a href="filters.html">filters</a> to apply to
each entry</dd>
</dl>
<dl class="compact code">

View File

@ -46,6 +46,11 @@ expressions. Again, parameters can be passed as
<a href="../tests/data/filter/xpath-sifter2.ini">URI style</a>.
</p>
<p>The <a href="../filters/regexp_sifter.py">regexp sifter</a> operates just
like the xpath sifter, except it uses
<a href="http://docs.python.org/lib/re-syntax.html">regular expressions</a>
instead of XPath expressions.</p>
<h3>Notes</h3>
<ul>

44
filters/regexp_sifter.py Normal file
View File

@ -0,0 +1,44 @@
import sys, re

# Substitution pipeline that reduces a normalized Atom entry to its
# human-readable text: machine-oriented metadata is removed, the values of
# alt/title/label/term attributes are kept, remaining markup is stripped,
# and common XML entities are decoded.  Patterns are compiled once at
# module load; order matters (entities are decoded only after tags are
# gone, and '&amp;' is decoded last so it cannot create new entities).
SUBSTITUTIONS = [
    (re.compile(r'<id>.*?</id>'), ' '),
    (re.compile(r'<url>.*?</url>'), ' '),
    (re.compile(r'<source>.*?</source>'), ' '),
    (re.compile(r'<updated.*?</updated>'), ' '),
    (re.compile(r'<published.*?</published>'), ' '),
    (re.compile(r'<link .*?>'), ' '),
    # Keep the human-visible value of these attributes in place of the tag.
    (re.compile(r'''<[^>]* alt=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* title=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* label=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* term=['"]([^'"]*)['"].*?>'''), r' \1 '),
    # Drop any remaining tags, then collapse runs of whitespace.
    (re.compile(r'<.*?>'), ' '),
    (re.compile(r'\s+'), ' '),
    (re.compile(r'&gt;'), '>'),
    (re.compile(r'&lt;'), '<'),
    (re.compile(r'&apos;'), "'"),
    (re.compile(r'&quot;'), '"'),
    (re.compile(r'&amp;'), '&'),
]

def extract_text(entry):
    """Return the textual portion of *entry* (a normalized Atom entry).

    Applies every substitution in SUBSTITUTIONS, in order.
    """
    for pattern, replacement in SUBSTITUTIONS:
        entry = pattern.sub(replacement, entry)
    return entry

def check_filters(options, text):
    """Return True if *text* satisfies the sifter options.

    options -- dict possibly containing '--require' and/or '--exclude',
               each a newline-separated list of regular expressions.
    text    -- the textual portion of the entry (see extract_text).

    Every non-empty '--require' expression must match, and no non-empty
    '--exclude' expression may match.  Empty expressions are ignored.
    """
    for regexp in options.get('--require', '').split('\n'):
        if regexp and not re.search(regexp, text):
            return False
    for regexp in options.get('--exclude', '').split('\n'):
        if regexp and re.search(regexp, text):
            return False
    return True

def main():
    """Sift the entry on stdin; echo it if it passes, exit(1) if not."""
    # Options arrive as alternating '--name value' command-line pairs.
    options = dict(zip(sys.argv[1::2], sys.argv[2::2]))
    # Read the entry once; filtering inspects a text-only copy, but the
    # original document is what gets echoed on success.
    doc = sys.stdin.read()
    if not check_filters(options, extract_text(doc)):
        sys.exit(1)
    # If we get this far, the entry is to be included.
    print(doc)

if __name__ == '__main__':
    main()

View File

@ -26,7 +26,7 @@ Todo:
* error handling (example: no planet section)
"""
import os, sys, re
import os, sys, re, urllib
from ConfigParser import ConfigParser
from urlparse import urljoin
@ -126,6 +126,8 @@ def __init__():
define_tmpl('content_type', '')
define_tmpl('future_dates', 'keep')
define_tmpl('xml_base', '')
define_tmpl('filter', None)
define_tmpl('exclude', None)
def load(config_file):
""" initialize and load a configuration"""
@ -330,7 +332,7 @@ def feedtype():
def subscriptions():
""" list the feed subscriptions """
return filter(lambda feed: feed!='Planet' and
return __builtins__['filter'](lambda feed: feed!='Planet' and
feed not in template_files()+filters()+reading_lists(),
parser.sections())
@ -350,6 +352,12 @@ def filters(section=None):
filters += parser.get('Planet', 'filters').split()
if section and parser.has_option(section, 'filters'):
filters += parser.get(section, 'filters').split()
if filter(section):
filters.append('regexp_sifter.py?require=' +
urllib.quote(filter(section)))
if exclude(section):
filters.append('regexp_sifter.py?exclude=' +
urllib.quote(filter(section)))
return filters
def planet_options():

View File

@ -0,0 +1,2 @@
[Planet]
filter=two

View File

@ -89,6 +89,25 @@ class FilterTests(unittest.TestCase):
self.assertNotEqual('', output)
def test_regexp_filter(self):
config.load('tests/data/filter/regexp-sifter.ini')
testfile = 'tests/data/filter/category-one.xml'
output = open(testfile).read()
for filter in config.filters():
output = shell.run(filter, output, mode="filter")
self.assertEqual('', output)
testfile = 'tests/data/filter/category-two.xml'
output = open(testfile).read()
for filter in config.filters():
output = shell.run(filter, output, mode="filter")
self.assertNotEqual('', output)
try:
from subprocess import Popen, PIPE