added a new config option: future_dates
parent e407a8d6f2
commit 6cc797ce0a
@@ -101,16 +101,14 @@ use for logging output. Note: this configuration value is processed
<dt><ins>spider_threads</ins></dt>
<dd>The number of threads to use when spidering. When set to 0, the default,
no threads are used and spidering follows the traditional algorithm.</dd>
<dt><ins>spider_threads</ins></dt>
<dd>The number of threads to use when spidering. When set to 0, the default,
no threads are used and spidering follows the traditional algorithm.</dd>
<dt><ins>http_cache_directory</ins></dt>
<dd>If <code>spider_threads</code> is specified, you can also specify a
directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd>
<code>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
</blockquote>

<h3 id="default"><code>[DEFAULT]</code></h3>

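By way of illustration only (not part of this commit; the values below are invented), the two options documented above are ordinary settings that can be read back through the config parser:

# Sketch only: hypothetical [Planet] values for the options documented above.
import StringIO
from planet import config

config.parser.readfp(StringIO.StringIO(
    "[Planet]\nspider_threads = 4\nhttp_cache_directory = http_cache\n"))
print config.parser.get('Planet', 'spider_threads')        # '4'
print config.parser.get('Planet', 'http_cache_directory')  # relative to cache_directory
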
@@ -69,8 +69,9 @@ are converted into
<li><a href="http://www.feedparser.org/docs/reference-entry-content.html">content</a></li>
</ul>
<p>If no <a href="http://www.feedparser.org/docs/reference-feed-
updated.html">updated</a> dates are found in an entry, or if the dates found
are in the future, the current time is substituted.</p>
updated.html">updated</a> dates are found in an entry, the updated date from
the feed is used. If no updated date is found in either the feed or
the entry, the current time is substituted.</p>
<h3 id="overrides">Overrides</h3>
<p>All of the above describes what Venus does automatically, either directly
or through its dependencies. There are a number of errors which can not
@@ -87,6 +88,13 @@ case of feeds where the <code>id</code>, <code>updated</code> or
attributes on these elements.</li>
<li><code>name_type</code> does something similar for
<a href="http://www.feedparser.org/docs/reference-entry-author_detail.html#reference.entry.author_detail.name">author names</a></li>
<li><code>future_dates</code> allows you to specify how to deal with dates which are in the future.
<ul style="margin:0">
<li><code>ignore_date</code> will cause the date to be ignored (and will therefore default to the time the entry was first seen) until the feed is updated and the time indicated is past, at which point the entry will be updated with the new date.</li>
<li><code>ignore_entry</code> will cause the entire entry containing the future date to be ignored until the date is past.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul>
</li>
</ul>
</body>
</html>

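A small sketch (not part of the commit; the feed URLs are hypothetical) of how the new override is set for one subscription and read back through planet.config, falling back to the 'keep' default defined in the next hunk:

# Sketch only: made-up feed sections exercising the future_dates override.
import StringIO
from planet import config

config.parser.readfp(StringIO.StringIO("""
[http://example.com/feed]
future_dates = ignore_date

[http://other.example/feed]
name = Other
"""))
print config.future_dates('http://example.com/feed')    # 'ignore_date'
print config.future_dates('http://other.example/feed')  # 'keep' (the default)
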
@@ -118,6 +118,7 @@ def __init__():
    define_tmpl('title_type', '')
    define_tmpl('summary_type', '')
    define_tmpl('content_type', '')
    define_tmpl('future_dates', 'keep')

def load(config_file):
    """ initialize and load a configuration"""
planet/scrub.py (new file, 94 lines)
@@ -0,0 +1,94 @@
"""
|
||||
Process a set of configuration defined sanitations on a given feed.
|
||||
"""
|
||||
|
||||
# Standard library modules
|
||||
import time
|
||||
# Planet modules
|
||||
import planet, config, shell
|
||||
|
||||
type_map = {'text': 'text/plain', 'html': 'text/html',
|
||||
'xhtml': 'application/xhtml+xml'}
|
||||
|
||||
def scrub(feed_uri, data):
|
||||
|
||||
# some data is not trustworthy
|
||||
for tag in config.ignore_in_feed(feed_uri).split():
|
||||
if tag.find('lang')>=0: tag='language'
|
||||
if data.feed.has_key(tag): del data.feed[tag]
|
||||
for entry in data.entries:
|
||||
if entry.has_key(tag): del entry[tag]
|
||||
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
|
||||
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
|
||||
for key in entry.keys():
|
||||
if not key.endswith('_detail'): continue
|
||||
for detail in entry[key].copy():
|
||||
if detail == tag: del entry[key][detail]
|
||||
|
||||
# adjust title types
|
||||
if config.title_type(feed_uri):
|
||||
title_type = config.title_type(feed_uri)
|
||||
title_type = type_map.get(title_type, title_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('title_detail'):
|
||||
entry.title_detail['type'] = title_type
|
||||
|
||||
# adjust summary types
|
||||
if config.summary_type(feed_uri):
|
||||
summary_type = config.summary_type(feed_uri)
|
||||
summary_type = type_map.get(summary_type, summary_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('summary_detail'):
|
||||
entry.summary_detail['type'] = summary_type
|
||||
|
||||
# adjust content types
|
||||
if config.content_type(feed_uri):
|
||||
content_type = config.content_type(feed_uri)
|
||||
content_type = type_map.get(content_type, content_type)
|
||||
for entry in data.entries:
|
||||
if entry.has_key('content'):
|
||||
entry.content[0]['type'] = content_type
|
||||
|
||||
# some people put html in author names
|
||||
if config.name_type(feed_uri).find('html')>=0:
|
||||
from shell.tmpl import stripHtml
|
||||
if data.feed.has_key('author_detail') and \
|
||||
data.feed.author_detail.has_key('name'):
|
||||
data.feed.author_detail['name'] = \
|
||||
str(stripHtml(data.feed.author_detail.name))
|
||||
for entry in data.entries:
|
||||
if entry.has_key('author_detail') and \
|
||||
entry.author_detail.has_key('name'):
|
||||
entry.author_detail['name'] = \
|
||||
str(stripHtml(entry.author_detail.name))
|
||||
if entry.has_key('source'):
|
||||
source = entry.source
|
||||
if source.has_key('author_detail') and \
|
||||
source.author_detail.has_key('name'):
|
||||
source.author_detail['name'] = \
|
||||
str(stripHtml(source.author_detail.name))
|
||||
|
||||
# handle dates in the future
|
||||
future_dates = config.future_dates(feed_uri).lower()
|
||||
if future_dates == 'ignore_date':
|
||||
now = time.gmtime()
|
||||
if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
|
||||
if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
|
||||
for entry in data.entries:
|
||||
if entry.has_key('published_parsed') and entry['published_parsed']:
|
||||
if entry['published_parsed'] > now:
|
||||
del entry['published_parsed']
|
||||
del entry['published']
|
||||
if entry.has_key('updated_parsed') and entry['updated_parsed']:
|
||||
if entry['updated_parsed'] > now:
|
||||
del entry['updated_parsed']
|
||||
del entry['updated']
|
||||
    elif future_dates == 'ignore_entry':
        now = time.gmtime()  # struct_time, so the comparisons below match the parsed dates
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        data.entries = [entry for entry in data.entries if
            (not entry.has_key('published_parsed') or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
                or entry['updated_parsed'] <= now)]
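A usage sketch for the new module (not from the commit; the inline feed and section name are made up, and the calls mirror the updated tests further down):

# Sketch only: scrub a feed whose single entry is dated in the future.
import time, StringIO
from planet import config, feedparser, scrub

feed = """<feed xmlns='http://www.w3.org/2005/Atom'><entry>
  <title>from the future</title>
  <updated>%d-01-01T00:00:00Z</updated>
</entry></feed>""" % (time.gmtime()[0] + 10)

config.parser.readfp(StringIO.StringIO(
    "[http://example.com/feed]\nfuture_dates = ignore_entry\n"))

data = feedparser.parse(feed)
scrub.scrub('http://example.com/feed', data)
print len(data.entries)   # 0: the future-dated entry was dropped
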
@@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory.
import time, calendar, re, os, urlparse
from xml.dom import minidom
# Planet modules
import planet, config, feedparser, reconstitute, shell, socket
import planet, config, feedparser, reconstitute, shell, socket, scrub
from StringIO import StringIO

# Regular expressions to sanitise cache filenames
@@ -57,66 +57,6 @@ def write(xdoc, out):
    file.write(xdoc)
    file.close()

type_map = {'text': 'text/plain', 'html': 'text/html',
    'xhtml': 'application/xhtml+xml'}

def scrub(feed, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed).split():
        if tag.find('lang')>=0: tag='language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed):
        title_type = config.title_type(feed)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed):
        summary_type = config.summary_type(feed)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed):
        content_type = config.content_type(feed)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed).find('html')>=0:
        from planet.shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))
def _is_http_uri(uri):
    parsed = urlparse.urlparse(uri)
    return parsed[0] in ['http', 'https']
@@ -209,7 +149,7 @@ def writeCache(feed_uri, feed_info, data):
        data.feed['planet_'+name] = value

    # perform user configured scrub operations on the data
    scrub(feed_uri, data)
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
@@ -244,7 +184,7 @@ def writeCache(feed_uri, feed_info, data):
                mtime = calendar.timegm(data.feed.updated_parsed)
            except:
                pass
        if not mtime or mtime > time.time(): mtime = time.time()
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
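The clamp removed above used to force future dates back to the present; that responsibility now sits in scrub and is governed by future_dates. A stdlib-only sketch of the arithmetic involved (none of this is Venus code):

# Sketch only: with future_dates = 'keep' (the default), a future 'updated'
# date survives scrub and drives the entry's cache mtime.
import time, calendar

future = time.gmtime(time.time() + 86400)   # an updated date one day ahead
mtime = calendar.timegm(future)             # what writeCache computes from it
print mtime > time.time()                   # True; previously this forced mtime back to now
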
@@ -1,7 +1,8 @@
#!/usr/bin/env python

import unittest, StringIO
from planet.spider import scrub
import unittest, StringIO, time
from copy import deepcopy
from planet.scrub import scrub
from planet import feedparser, config

feed = '''
@@ -10,7 +11,7 @@ feed = '''
<entry xml:lang="en">
<id>ignoreme</id>
<author><name>F&ouml;o</name></author>
<updated>2000-01-01T00:00:00Z</updated>
<updated>%d-12-31T23:59:59Z</updated>
<title>F&ouml;o</title>
<summary>F&ouml;o</summary>
<content>F&ouml;o</content>
@@ -19,11 +20,10 @@ feed = '''
</source>
</entry>
</feed>
'''
''' % (time.gmtime()[0] + 1)

configData = '''
[testfeed]
ignore_in_feed = id updated xml:lang
name_type = html
title_type = html
summary_type = html
@@ -32,16 +32,17 @@ content_type = html

class ScrubTest(unittest.TestCase):

    def test_scrub(self):
        data = feedparser.parse(feed)
    def test_scrub_ignore(self):
        base = feedparser.parse(feed)

        self.assertTrue(base.entries[0].has_key('id'))
        self.assertTrue(base.entries[0].has_key('updated'))
        self.assertTrue(base.entries[0].has_key('updated_parsed'))
        self.assertTrue(base.entries[0].summary_detail.has_key('language'))

        config.parser.readfp(StringIO.StringIO(configData))

        self.assertEqual('Föo', data.feed.author_detail.name)
        self.assertTrue(data.entries[0].has_key('id'))
        self.assertTrue(data.entries[0].has_key('updated'))
        self.assertTrue(data.entries[0].has_key('updated_parsed'))
        self.assertTrue(data.entries[0].summary_detail.has_key('language'))

        config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
        data = deepcopy(base)
        scrub('testfeed', data)

        self.assertFalse(data.entries[0].has_key('id'))
@@ -49,6 +50,15 @@ class ScrubTest(unittest.TestCase):
        self.assertFalse(data.entries[0].has_key('updated_parsed'))
        self.assertFalse(data.entries[0].summary_detail.has_key('language'))

    def test_scrub_type(self):
        base = feedparser.parse(feed)

        self.assertEqual('Föo', base.feed.author_detail.name)

        config.parser.readfp(StringIO.StringIO(configData))
        data = deepcopy(base)
        scrub('testfeed', data)

        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
@@ -57,3 +67,18 @@ class ScrubTest(unittest.TestCase):
        self.assertEqual('text/html', data.entries[0].summary_detail.type)
        self.assertEqual('text/html', data.entries[0].content[0].type)

    def test_scrub_future(self):
        base = feedparser.parse(feed)
        self.assertEqual(1, len(base.entries))
        self.assertTrue(base.entries[0].has_key('updated'))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'future_dates', 'ignore_date')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertFalse(data.entries[0].has_key('updated'))

        config.parser.set('testfeed', 'future_dates', 'ignore_entry')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual(0, len(data.entries))