regexp sifter
This commit is contained in:
parent
81d10e1f5c
commit
a0642afec0
@ -61,8 +61,13 @@ material information.</dd>
|
|||||||
can be found</dd>
|
can be found</dd>
|
||||||
<dt><ins>bill_of_materials</ins></dt>
|
<dt><ins>bill_of_materials</ins></dt>
|
||||||
<dd>Space-separated list of files to be copied as is directly from the <code>template_directories</code> to the <code>output_dir</code></dd>
|
<dd>Space-separated list of files to be copied as is directly from the <code>template_directories</code> to the <code>output_dir</code></dd>
|
||||||
|
<dt>filter</dt>
|
||||||
|
<dd>Regular expression that must be found in the textual portion of the entry</dd>
|
||||||
|
<dt>exclude</dt>
|
||||||
|
<dd>Regular expression that must <b>not</b> be found in the textual portion of the entry</dd>
|
||||||
<dt><ins>filters</ins></dt>
|
<dt><ins>filters</ins></dt>
|
||||||
<dd>Space-separated list of filters to apply to each entry</dd>
|
<dd>Space-separated list of <a href="filters.html">filters</a> to apply to
|
||||||
|
each entry</dd>
|
||||||
|
|
||||||
</dl>
|
</dl>
|
||||||
<dl class="compact code">
|
<dl class="compact code">
|
||||||
|
@ -46,6 +46,11 @@ expressions. Again, parameters can be passed as
|
|||||||
<a href="../tests/data/filter/xpath-sifter2.ini">URI style</a>.
|
<a href="../tests/data/filter/xpath-sifter2.ini">URI style</a>.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
<p>The <a href="../filters/regexp_sifter.py">regexp sifter</a> operates just
|
||||||
|
like the xpath sifter, except it uses
|
||||||
|
<a href="http://docs.python.org/lib/re-syntax.html">regular expressions</a>
|
||||||
|
instead of XPath expressions.</p>
|
||||||
|
|
||||||
<h3>Notes</h3>
|
<h3>Notes</h3>
|
||||||
|
|
||||||
<ul>
|
<ul>
|
||||||
|
44
filters/regexp_sifter.py
Normal file
44
filters/regexp_sifter.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
import sys, re
|
||||||
|
|
||||||
|
# Ordered rewrite rules which turn a normalized Atom entry into a stream of
# text, after removal of non-human metadata.  Order matters:
#   1. elements whose content is machine metadata are dropped entirely;
#   2. human-readable attribute values (alt/title/label/term) are kept
#      while the tag that carried them is dropped;
#   3. every remaining tag is dropped and whitespace is collapsed;
#   4. the predefined XML entities are decoded, with &amp; handled last so
#      that an escaped ampersand is never decoded twice.
TEXT_RULES = [
    (re.compile(r'<id>.*?</id>'), ' '),
    (re.compile(r'<url>.*?</url>'), ' '),
    (re.compile(r'<source>.*?</source>'), ' '),
    (re.compile(r'<updated.*?</updated>'), ' '),
    (re.compile(r'<published.*?</published>'), ' '),
    (re.compile(r'<link .*?>'), ' '),
    (re.compile(r'''<[^>]* alt=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* title=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* label=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'''<[^>]* term=['"]([^'"]*)['"].*?>'''), r' \1 '),
    (re.compile(r'<.*?>'), ' '),
    (re.compile(r'\s+'), ' '),
    (re.compile(r'&gt;'), '>'),
    (re.compile(r'&lt;'), '<'),
    (re.compile(r'&apos;'), "'"),
    (re.compile(r'&quot;'), '"'),
    (re.compile(r'&amp;'), '&'),
    (re.compile(r'\s+'), ' '),
]


def entry_text(entry):
    """Return the textual portion of an entry.

    Applies every rule in TEXT_RULES, in order, to the serialized entry
    and returns the resulting string.
    """
    for pattern, replacement in TEXT_RULES:
        entry = pattern.sub(replacement, entry)
    return entry


def accepts(options, text):
    """Tell whether text satisfies the --require and --exclude options.

    options maps option names to values; each value is a newline-separated
    list of regular expressions (empty lines are ignored).  Every
    '--require' expression must be found in text and no '--exclude'
    expression may be found in it.  A missing option imposes no constraint.
    """
    # process requirements
    for regexp in options.get('--require', '').split('\n'):
        if regexp and not re.search(regexp, text):
            return False

    # process exclusions
    for regexp in options.get('--exclude', '').split('\n'):
        if regexp and re.search(regexp, text):
            return False

    return True


def main():
    """Read an entry from stdin and echo it to stdout if it is accepted.

    Exits with status 1, printing nothing, when the entry is rejected.
    """
    # parse options: arguments are taken as alternating name/value pairs
    options = dict(zip(sys.argv[1::2], sys.argv[2::2]))

    # read entry
    doc = sys.stdin.read()

    if not accepts(options, entry_text(doc)):
        sys.exit(1)

    # if we get this far, the feed is to be included
    print(doc)


if __name__ == '__main__':
    main()
|
@ -26,7 +26,7 @@ Todo:
|
|||||||
* error handling (example: no planet section)
|
* error handling (example: no planet section)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os, sys, re
|
import os, sys, re, urllib
|
||||||
from ConfigParser import ConfigParser
|
from ConfigParser import ConfigParser
|
||||||
from urlparse import urljoin
|
from urlparse import urljoin
|
||||||
|
|
||||||
@ -126,6 +126,8 @@ def __init__():
|
|||||||
define_tmpl('content_type', '')
|
define_tmpl('content_type', '')
|
||||||
define_tmpl('future_dates', 'keep')
|
define_tmpl('future_dates', 'keep')
|
||||||
define_tmpl('xml_base', '')
|
define_tmpl('xml_base', '')
|
||||||
|
define_tmpl('filter', None)
|
||||||
|
define_tmpl('exclude', None)
|
||||||
|
|
||||||
def load(config_file):
|
def load(config_file):
|
||||||
""" initialize and load a configuration"""
|
""" initialize and load a configuration"""
|
||||||
@ -330,7 +332,7 @@ def feedtype():
|
|||||||
|
|
||||||
def subscriptions():
    """ list the feed subscriptions

    Returns the configuration sections that are neither the 'Planet'
    section nor named by template_files(), filters() or reading_lists().
    """
    # A comprehension is used instead of the builtin filter(): this module
    # defines its own 'filter' option, and looking the builtin up through
    # __builtins__['filter'] only works while __builtins__ is a dict.
    return [feed for feed in parser.sections()
            if feed != 'Planet' and
            feed not in template_files() + filters() + reading_lists()]
|
||||||
|
|
||||||
@ -350,6 +352,12 @@ def filters(section=None):
|
|||||||
filters += parser.get('Planet', 'filters').split()
|
filters += parser.get('Planet', 'filters').split()
|
||||||
if section and parser.has_option(section, 'filters'):
|
if section and parser.has_option(section, 'filters'):
|
||||||
filters += parser.get(section, 'filters').split()
|
filters += parser.get(section, 'filters').split()
|
||||||
|
if filter(section):
|
||||||
|
filters.append('regexp_sifter.py?require=' +
|
||||||
|
urllib.quote(filter(section)))
|
||||||
|
if exclude(section):
|
||||||
|
filters.append('regexp_sifter.py?exclude=' +
|
||||||
|
urllib.quote(exclude(section)))
|
||||||
return filters
|
return filters
|
||||||
|
|
||||||
def planet_options():
|
def planet_options():
|
||||||
|
2
tests/data/filter/regexp-sifter.ini
Normal file
2
tests/data/filter/regexp-sifter.ini
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
[Planet]
|
||||||
|
filter=two
|
@ -89,6 +89,25 @@ class FilterTests(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertNotEqual('', output)
|
self.assertNotEqual('', output)
|
||||||
|
|
||||||
|
def test_regexp_filter(self):
    """The regexp sifter drops category-one.xml and keeps category-two.xml.

    The configuration under test (regexp-sifter.ini) sets ``filter=two``.
    """
    config.load('tests/data/filter/regexp-sifter.ini')

    def sift(path):
        # Pipe the fixture through every configured filter, in order.
        result = open(path).read()
        for name in config.filters():
            result = shell.run(name, result, mode="filter")
        return result

    # An entry the filters reject comes back as an empty string.
    self.assertEqual('', sift('tests/data/filter/category-one.xml'))

    self.assertNotEqual('', sift('tests/data/filter/category-two.xml'))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from subprocess import Popen, PIPE
|
from subprocess import Popen, PIPE
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user