From 6cc797ce0ae9912dc7f6ee89033f90502982893c Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Thu, 7 Dec 2006 18:31:45 -0500
Subject: [PATCH] added a new config option: future_dates

---
 docs/config.html        |  6 +--
 docs/normalization.html | 12 +++++-
 planet/config.py        |  1 +
 planet/scrub.py         | 94 +++++++++++++++++++++++++++++++++++++++++
 planet/spider.py        | 66 ++---------------------------
 tests/test_scrub.py     | 53 +++++++++++++++++------
 6 files changed, 149 insertions(+), 83 deletions(-)
 create mode 100644 planet/scrub.py

diff --git a/docs/config.html b/docs/config.html
index f992e2e..9491a29 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -101,16 +101,14 @@ use for logging output. Note: this configuration value is processed
 spider_threads
 The number of threads to use when spidering.  When set to 0, the default,
 no threads are used and spidering follows the traditional algorithm.
-spider_threads
-The number of threads to use when spidering.  When set to 0, the default,
-no threads are used and spidering follows the traditional algorithm.
 http_cache_directory
 If spider_threads is specified, you can also specify a directory to be used
 for an additional HTTP cache to front end the Venus cache.  If specified as
 a relative path, it is evaluated relative to the cache_directory.
-
+Additional options can be found in
+normalization level overrides.
 
 [DEFAULT]
 
diff --git a/docs/normalization.html b/docs/normalization.html
index de73812..08465f5 100644
--- a/docs/normalization.html
+++ b/docs/normalization.html
@@ -69,8 +69,9 @@ are converted into
   • content

-  • If no updated dates are found in an entry, or if the dates found
-    are in the future, the current time is substituted.
+  • If no updated dates are found in an entry, the updated date from
+    the feed is used.  If no updated date is found in either the feed or
+    the entry, the current time is substituted.

 Overrides

 All of the above describes what Venus does automatically, either directly
 or through its dependencies.  There are a number of errors which can not
@@ -87,6 +88,13 @@ case of feeds where the id, updated or
 attributes on these elements.

   • name_type does something similar for author names

+  • future_dates allows you to specify how to deal with dates which are
+    in the future.
+      • ignore_date will cause the date to be ignored (and will therefore
+        default to the time the entry was first seen) until the feed is
+        updated and the time indicated is past, at which point the entry
+        will be updated with the new date.
+      • ignore_entry will cause the entire entry containing the future
+        date to be ignored until the date is past.
+      • Anything else (i.e. the default) will leave the date as is,
+        causing the entries that contain these dates to sort to the top
+        of the planet until the time passes.
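To make the three behaviours concrete, here is a minimal sketch of selecting one of them through the configuration API, modelled on tests/test_scrub.py; the 'testfeed' section name comes from that test fixture, and a real planet would use one of its subscription sections instead:

    # Sketch modelled on tests/test_scrub.py: register a section, then pick a
    # future_dates behaviour for it.  Feeds with no explicit setting fall back
    # to 'keep', the default declared in planet/config.py.
    import StringIO
    from planet import config

    config.parser.readfp(StringIO.StringIO("[testfeed]\n"))
    config.parser.set('testfeed', 'future_dates', 'ignore_entry')
    print config.future_dates('testfeed')    # -> 'ignore_entry'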
diff --git a/planet/config.py b/planet/config.py
index da8de60..cc12ecc 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -118,6 +118,7 @@ def __init__():
     define_tmpl('title_type', '')
     define_tmpl('summary_type', '')
     define_tmpl('content_type', '')
+    define_tmpl('future_dates', 'keep')
 
 def load(config_file):
     """ initialize and load a configuration"""
diff --git a/planet/scrub.py b/planet/scrub.py
new file mode 100644
index 0000000..42d75ae
--- /dev/null
+++ b/planet/scrub.py
@@ -0,0 +1,94 @@
+"""
+Process a set of configuration defined sanitations on a given feed.
+"""
+
+# Standard library modules
+import time
+# Planet modules
+import planet, config, shell
+
+type_map = {'text': 'text/plain', 'html': 'text/html',
+            'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed_uri, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed_uri).split():
+        if tag.find('lang')>=0: tag='language'
+        if data.feed.has_key(tag): del data.feed[tag]
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+            for key in entry.keys():
+                if not key.endswith('_detail'): continue
+                for detail in entry[key].copy():
+                    if detail == tag: del entry[key][detail]
+
+    # adjust title types
+    if config.title_type(feed_uri):
+        title_type = config.title_type(feed_uri)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed_uri):
+        summary_type = config.summary_type(feed_uri)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed_uri):
+        content_type = config.content_type(feed_uri)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed_uri).find('html')>=0:
+        from shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
+    # handle dates in the future
+    future_dates = config.future_dates(feed_uri).lower()
+    if future_dates == 'ignore_date':
+        now = time.gmtime()
+        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
+            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
+        for entry in data.entries:
+            if entry.has_key('published_parsed') and entry['published_parsed']:
+                if entry['published_parsed'] > now:
+                    del entry['published_parsed']
+                    del entry['published']
+            if entry.has_key('updated_parsed') and entry['updated_parsed']:
+                if entry['updated_parsed'] > now:
+                    del entry['updated_parsed']
+                    del entry['updated']
+    elif future_dates == 'ignore_entry':
+        now = time.gmtime()
+        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
+            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
+        data.entries = [entry for entry in data.entries if
+            (not entry.has_key('published_parsed') or not entry['published_parsed']
+                or entry['published_parsed'] <= now) and
+            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
+                or entry['updated_parsed'] <= now)]
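The two new branches turn on ordered comparisons between feedparser's *_parsed values and time.gmtime(); both are UTC struct_time nine-tuples, so comparing them element-wise orders them chronologically. A small illustration, not part of the patch:

    # Illustration only: struct_time values compare chronologically, which is
    # what lets scrub.py decide whether a parsed date lies in the future.
    import time

    now = time.gmtime()
    next_year = time.struct_time((now[0] + 1,) + tuple(now)[1:])
    print next_year > now    # True -- a date one year ahead sorts after 'now'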
diff --git a/planet/spider.py b/planet/spider.py
index bc22d1f..6630be9 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory.
 import time, calendar, re, os, urlparse
 from xml.dom import minidom
 # Planet modules
-import planet, config, feedparser, reconstitute, shell, socket
+import planet, config, feedparser, reconstitute, shell, socket, scrub
 from StringIO import StringIO
 
 # Regular expressions to sanitise cache filenames
@@ -57,66 +57,6 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()
 
-type_map = {'text': 'text/plain', 'html': 'text/html',
-            'xhtml': 'application/xhtml+xml'}
-
-def scrub(feed, data):
-
-    # some data is not trustworthy
-    for tag in config.ignore_in_feed(feed).split():
-        if tag.find('lang')>=0: tag='language'
-        if data.feed.has_key(tag): del data.feed[tag]
-        for entry in data.entries:
-            if entry.has_key(tag): del entry[tag]
-            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
-            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
-            for key in entry.keys():
-                if not key.endswith('_detail'): continue
-                for detail in entry[key].copy():
-                    if detail == tag: del entry[key][detail]
-
-    # adjust title types
-    if config.title_type(feed):
-        title_type = config.title_type(feed)
-        title_type = type_map.get(title_type, title_type)
-        for entry in data.entries:
-            if entry.has_key('title_detail'):
-                entry.title_detail['type'] = title_type
-
-    # adjust summary types
-    if config.summary_type(feed):
-        summary_type = config.summary_type(feed)
-        summary_type = type_map.get(summary_type, summary_type)
-        for entry in data.entries:
-            if entry.has_key('summary_detail'):
-                entry.summary_detail['type'] = summary_type
-
-    # adjust content types
-    if config.content_type(feed):
-        content_type = config.content_type(feed)
-        content_type = type_map.get(content_type, content_type)
-        for entry in data.entries:
-            if entry.has_key('content'):
-                entry.content[0]['type'] = content_type
-
-    # some people put html in author names
-    if config.name_type(feed).find('html')>=0:
-        from planet.shell.tmpl import stripHtml
-        if data.feed.has_key('author_detail') and \
-            data.feed.author_detail.has_key('name'):
-            data.feed.author_detail['name'] = \
-                str(stripHtml(data.feed.author_detail.name))
-        for entry in data.entries:
-            if entry.has_key('author_detail') and \
-                entry.author_detail.has_key('name'):
-                entry.author_detail['name'] = \
-                    str(stripHtml(entry.author_detail.name))
-            if entry.has_key('source'):
-                source = entry.source
-                if source.has_key('author_detail') and \
-                    source.author_detail.has_key('name'):
-                    source.author_detail['name'] = \
-                        str(stripHtml(source.author_detail.name))
-
 def _is_http_uri(uri):
     parsed = urlparse.urlparse(uri)
     return parsed[0] in ['http', 'https']
@@ -209,7 +149,7 @@ def writeCache(feed_uri, feed_info, data):
             data.feed['planet_'+name] = value
 
     # perform user configured scrub operations on the data
-    scrub(feed_uri, data)
+    scrub.scrub(feed_uri, data)
 
     from planet import idindex
     global index
@@ -244,7 +184,7 @@ def writeCache(feed_uri, feed_info, data):
             mtime = calendar.timegm(data.feed.updated_parsed)
         except:
             pass
-    if not mtime or mtime > time.time(): mtime = time.time()
+    if not mtime: mtime = time.time()
     entry['updated_parsed'] = time.gmtime(mtime)
 
     # apply any filters
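With the sanitisation code factored out of the spider, it can also be driven on its own. A minimal sketch of the new entry point, mirroring the call spider.py now makes; the feed URI is hypothetical, and a configuration that defines that feed is assumed to have been loaded already:

    # Hypothetical feed URI; assumes config.load() (or an equivalent readfp)
    # has already registered a section for it.
    from planet import feedparser, scrub

    uri = 'http://example.com/atom.xml'
    data = feedparser.parse(uri)
    scrub.scrub(uri, data)    # apply ignore_in_feed, *_type, name_type, future_dates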
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index 7d9d1b0..17874a3 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 
-import unittest, StringIO
-from planet.spider import scrub
+import unittest, StringIO, time
+from copy import deepcopy
+from planet.scrub import scrub
 from planet import feedparser, config
 
 feed = '''
@@ -10,7 +11,7 @@ feed = '''
     ignoreme
     F&ouml;o
-    2000-01-01T00:00:00Z
+    %d-12-31T23:59:59Z
     F&ouml;o
     F&ouml;o
     F&ouml;o
@@ -19,11 +20,10 @@ feed = '''
-'''
+''' % (time.gmtime()[0] + 1)
 
 configData = '''
 [testfeed]
-ignore_in_feed = id updated xml:lang
 name_type = html
 title_type = html
 summary_type = html
@@ -32,16 +32,17 @@ content_type = html
 
 class ScrubTest(unittest.TestCase):
 
-    def test_scrub(self):
-        data = feedparser.parse(feed)
+    def test_scrub_ignore(self):
+        base = feedparser.parse(feed)
+
+        self.assertTrue(base.entries[0].has_key('id'))
+        self.assertTrue(base.entries[0].has_key('updated'))
+        self.assertTrue(base.entries[0].has_key('updated_parsed'))
+        self.assertTrue(base.entries[0].summary_detail.has_key('language'))
+
         config.parser.readfp(StringIO.StringIO(configData))
-
-        self.assertEqual('Föo', data.feed.author_detail.name)
-        self.assertTrue(data.entries[0].has_key('id'))
-        self.assertTrue(data.entries[0].has_key('updated'))
-        self.assertTrue(data.entries[0].has_key('updated_parsed'))
-        self.assertTrue(data.entries[0].summary_detail.has_key('language'))
-
+        config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
+        data = deepcopy(base)
         scrub('testfeed', data)
 
         self.assertFalse(data.entries[0].has_key('id'))
@@ -49,6 +50,15 @@ class ScrubTest(unittest.TestCase):
         self.assertFalse(data.entries[0].has_key('updated_parsed'))
         self.assertFalse(data.entries[0].summary_detail.has_key('language'))
 
+    def test_scrub_type(self):
+        base = feedparser.parse(feed)
+
+        self.assertEqual('Föo', base.feed.author_detail.name)
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        data = deepcopy(base)
+        scrub('testfeed', data)
+
         self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
         self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
         self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
@@ -57,3 +67,18 @@ class ScrubTest(unittest.TestCase):
         self.assertEqual('text/html', data.entries[0].summary_detail.type)
         self.assertEqual('text/html', data.entries[0].content[0].type)
 
+    def test_scrub_future(self):
+        base = feedparser.parse(feed)
+        self.assertEqual(1, len(base.entries))
+        self.assertTrue(base.entries[0].has_key('updated'))
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        config.parser.set('testfeed', 'future_dates', 'ignore_date')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertFalse(data.entries[0].has_key('updated'))
+
+        config.parser.set('testfeed', 'future_dates', 'ignore_entry')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual(0, len(data.entries))
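The rewritten fixture is what makes the future_dates assertions deterministic: the entry's updated element is stamped with the last second of next year, so it is always in the future when the suite runs. For example:

    # What the '%d-12-31T23:59:59Z' substitution in the test fixture produces:
    import time
    print '%d-12-31T23:59:59Z' % (time.gmtime()[0] + 1)   # e.g. '2007-12-31T23:59:59Z' in 2006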