added a new config option: future_dates

Sam Ruby 2006-12-07 18:31:45 -05:00
parent e407a8d6f2
commit 6cc797ce0a
6 changed files with 149 additions and 83 deletions

View File

@@ -101,16 +101,14 @@ use for logging output. Note: this configuration value is processed
<dt><ins>spider_threads</ins></dt>
<dd>The number of threads to use when spidering. When set to 0, the default,
no threads are used and spidering follows the traditional algorithm.</dd>
<dt><ins>http_cache_directory</ins></dt>
<dd>If <code>spider_threads</code> is specified, you can also specify a
directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
</blockquote>
<h3 id="default"><code>[DEFAULT]</code></h3>

View File

@@ -69,8 +69,9 @@ are converted into
<li><a href="http://www.feedparser.org/docs/reference-entry-content.html">content</a></li>
</ul>
<p>If no <a href="http://www.feedparser.org/docs/reference-feed-
updated.html">updated</a> dates are found in an entry, or if the dates found
are in the future, the current time is substituted.</p>
updated.html">updated</a> dates are found in an entry, the updated date from
the feed is used. If no updated date is found in either the feed or
the entry, the current time is substituted.</p>
<h3 id="overrides">Overrides</h3>
<p>All of the above describes what Venus does automatically, either directly
or through its dependencies. There are a number of errors which can not
@@ -87,6 +88,13 @@ case of feeds where the <code>id</code>, <code>updated</code> or
attributes on these elements.</li>
<li><code>name_type</code> does something similar for
<a href="http://www.feedparser.org/docs/reference-entry-author_detail.html#reference.entry.author_detail.name">author names</a></li>
<li><code>future_dates</code> allows you to specify how to deal with dates which are in the future.
<ul style="margin:0">
<li><code>ignore_date</code> will cause the date to be ignored (and will therefore default to the time the entry was first seen) until the feed is updated and the time indicated is past, at which point the entry will be updated with the new date.</li>
<li><code>ignore_entry</code> will cause the entire entry containing the future date to be ignored until the date is past.</li>
<li>Anything else (i.e., the default) will leave the date as is, causing the entries that contain these dates to sort to the top of the planet until the time passes.</li>
</ul>
</li>
</ul>
</body>
</html>
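The future_dates override documented above slots into config.ini alongside the other per-feed normalization overrides. A minimal, purely illustrative fragment (the feed URL is made up):

[http://example.com/blog/atom.xml]
future_dates = ignore_entry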

View File

@@ -118,6 +118,7 @@ def __init__():
    define_tmpl('title_type', '')
    define_tmpl('summary_type', '')
    define_tmpl('content_type', '')
    define_tmpl('future_dates', 'keep')

def load(config_file):
    """ initialize and load a configuration"""

planet/scrub.py Normal file
View File

@@ -0,0 +1,94 @@
"""
Process a set of configuration defined sanitations on a given feed.
"""

# Standard library modules
import time
# Planet modules
import planet, config, shell

type_map = {'text': 'text/plain', 'html': 'text/html',
    'xhtml': 'application/xhtml+xml'}

def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang')>=0: tag='language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html')>=0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        now = time.time()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
        data.entries = [entry for entry in data.entries if
            (not entry.has_key('published_parsed') or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
                or entry['updated_parsed'] <= now)]
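The ignore_date branch above works because feedparser exposes updated_parsed and published_parsed as time.struct_time nine-tuples, which compare field by field against time.gmtime(). A tiny illustrative check (the one-day offset is arbitrary):

import time

future = time.gmtime(time.time() + 86400)  # a timestamp one day ahead of now
print future > time.gmtime()               # True, so scrub() would drop the date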

View File

@@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory.
import time, calendar, re, os, urlparse
from xml.dom import minidom
# Planet modules
import planet, config, feedparser, reconstitute, shell, socket
import planet, config, feedparser, reconstitute, shell, socket, scrub
from StringIO import StringIO
# Regular expressions to sanitise cache filenames
@@ -57,66 +57,6 @@ def write(xdoc, out):
    file.write(xdoc)
    file.close()

type_map = {'text': 'text/plain', 'html': 'text/html',
    'xhtml': 'application/xhtml+xml'}

def scrub(feed, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed).split():
        if tag.find('lang')>=0: tag='language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed):
        title_type = config.title_type(feed)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed):
        summary_type = config.summary_type(feed)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed):
        content_type = config.content_type(feed)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed).find('html')>=0:
        from planet.shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

def _is_http_uri(uri):
    parsed = urlparse.urlparse(uri)
    return parsed[0] in ['http', 'https']
@@ -209,7 +149,7 @@ def writeCache(feed_uri, feed_info, data):
        data.feed['planet_'+name] = value

    # perform user configured scrub operations on the data
    scrub(feed_uri, data)
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
@@ -244,7 +184,7 @@ def writeCache(feed_uri, feed_info, data):
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime or mtime > time.time(): mtime = time.time()
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters

View File

@@ -1,7 +1,8 @@
#!/usr/bin/env python

import unittest, StringIO
from planet.spider import scrub
import unittest, StringIO, time
from copy import deepcopy
from planet.scrub import scrub
from planet import feedparser, config

feed = '''
@@ -10,7 +11,7 @@ feed = '''
  <entry xml:lang="en">
    <id>ignoreme</id>
    <author><name>F&amp;ouml;o</name></author>
    <updated>2000-01-01T00:00:00Z</updated>
    <updated>%d-12-31T23:59:59Z</updated>
    <title>F&amp;ouml;o</title>
    <summary>F&amp;ouml;o</summary>
    <content>F&amp;ouml;o</content>
@@ -19,11 +20,10 @@ feed = '''
    </source>
  </entry>
</feed>
'''
''' % (time.gmtime()[0] + 1)

configData = '''
[testfeed]
ignore_in_feed = id updated xml:lang
name_type = html
title_type = html
summary_type = html
@@ -32,16 +32,17 @@ content_type = html
class ScrubTest(unittest.TestCase):

    def test_scrub(self):
        data = feedparser.parse(feed)
    def test_scrub_ignore(self):
        base = feedparser.parse(feed)
        self.assertTrue(base.entries[0].has_key('id'))
        self.assertTrue(base.entries[0].has_key('updated'))
        self.assertTrue(base.entries[0].has_key('updated_parsed'))
        self.assertTrue(base.entries[0].summary_detail.has_key('language'))
        config.parser.readfp(StringIO.StringIO(configData))
        self.assertEqual('F&ouml;o', data.feed.author_detail.name)
        self.assertTrue(data.entries[0].has_key('id'))
        self.assertTrue(data.entries[0].has_key('updated'))
        self.assertTrue(data.entries[0].has_key('updated_parsed'))
        self.assertTrue(data.entries[0].summary_detail.has_key('language'))
        config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertFalse(data.entries[0].has_key('id'))
@ -49,6 +50,15 @@ class ScrubTest(unittest.TestCase):
        self.assertFalse(data.entries[0].has_key('updated_parsed'))
        self.assertFalse(data.entries[0].summary_detail.has_key('language'))

    def test_scrub_type(self):
        base = feedparser.parse(feed)
        self.assertEqual('F&ouml;o', base.feed.author_detail.name)
        config.parser.readfp(StringIO.StringIO(configData))
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
        self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
@ -57,3 +67,18 @@ class ScrubTest(unittest.TestCase):
        self.assertEqual('text/html', data.entries[0].summary_detail.type)
        self.assertEqual('text/html', data.entries[0].content[0].type)

    def test_scrub_future(self):
        base = feedparser.parse(feed)
        self.assertEqual(1, len(base.entries))
        self.assertTrue(base.entries[0].has_key('updated'))

        config.parser.readfp(StringIO.StringIO(configData))
        config.parser.set('testfeed', 'future_dates', 'ignore_date')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertFalse(data.entries[0].has_key('updated'))

        config.parser.set('testfeed', 'future_dates', 'ignore_entry')
        data = deepcopy(base)
        scrub('testfeed', data)
        self.assertEqual(0, len(data.entries))
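Note that the %d placeholder in the test feed is filled with time.gmtime()[0] + 1, i.e. the 31st of December of next year, so the entry always carries a future timestamp when the suite runs. For example:

import time

print '%d-12-31T23:59:59Z' % (time.gmtime()[0] + 1)  # e.g. 2007-12-31T23:59:59Z when run in 2006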