From 6cc797ce0ae9912dc7f6ee89033f90502982893c Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Thu, 7 Dec 2006 18:31:45 -0500
Subject: [PATCH] added a new config option: future_dates

---
 docs/config.html        |  6 +--
 docs/normalization.html | 12 +++++-
 planet/config.py        |  1 +
 planet/scrub.py         | 94 +++++++++++++++++++++++++++++++++++++++++
 planet/spider.py        | 66 ++---------------------------
 tests/test_scrub.py     | 53 +++++++++++++++++------
 6 files changed, 149 insertions(+), 83 deletions(-)
 create mode 100644 planet/scrub.py

diff --git a/docs/config.html b/docs/config.html
index f992e2e..9491a29 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -101,16 +101,14 @@ use for logging output. Note: this configuration value is processed
 spider_threads
 The number of threads to use when spidering.  When set to 0, the default,
 no threads are used and spidering follows the traditional algorithm.
-spider_threads
-The number of threads to use when spidering.  When set to 0, the default,
-no threads are used and spidering follows the traditional algorithm.
 http_cache_directory
 If spider_threads is specified, you can also specify a directory to be used
 for an additional HTTP cache to front end the Venus cache.  If specified as
 a relative path, it is evaluated relative to the cache_directory.
-
+Additional options can be found in
+normalization level overrides.
 
 [DEFAULT]
 
diff --git a/docs/normalization.html b/docs/normalization.html
index de73812..08465f5 100644
--- a/docs/normalization.html
+++ b/docs/normalization.html
@@ -69,8 +69,9 @@ are converted into
   • content

-  • If no updated dates are found in an entry, or if the dates found
-    are in the future, the current time is substituted.
+  • If no updated dates are found in an entry, the updated date from
+    the feed is used.  If no updated date is found in either the feed or
+    the entry, the current time is substituted.

 Overrides

 All of the above describes what Venus does automatically, either directly
 or through its dependencies.  There are a number of errors which can not
@@ -87,6 +88,13 @@ case of feeds where the id, updated or
 attributes on these elements.

   • name_type does something similar for author names

+  • future_dates allows you to specify how to deal with dates which are
+    in the future.
+      • ignore_date will cause the date to be ignored (and will therefore
+        default to the time the entry was first seen) until the feed is
+        updated and the time indicated is past, at which point the entry
+        will be updated with the new date.
+      • ignore_entry will cause the entire entry containing the future
+        date to be ignored until the date is past.
+      • Anything else (i.e. the default) will leave the date as is,
+        causing the entries that contain these dates to sort to the top
+        of the planet until the time passes.
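To make the three behaviours concrete, here is a minimal sketch of selecting one of them through the configuration API, modelled on tests/test_scrub.py; the 'testfeed' section name comes from that test fixture, and a real planet would use one of its subscription sections instead:

    # Sketch modelled on tests/test_scrub.py: register a section, then pick a
    # future_dates behaviour for it.  Feeds with no explicit setting fall back
    # to 'keep', the default declared in planet/config.py.
    import StringIO
    from planet import config

    config.parser.readfp(StringIO.StringIO("[testfeed]\n"))
    config.parser.set('testfeed', 'future_dates', 'ignore_entry')
    print config.future_dates('testfeed')    # -> 'ignore_entry'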
diff --git a/planet/config.py b/planet/config.py
index da8de60..cc12ecc 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -118,6 +118,7 @@ def __init__():
     define_tmpl('title_type', '')
     define_tmpl('summary_type', '')
     define_tmpl('content_type', '')
+    define_tmpl('future_dates', 'keep')
 
 def load(config_file):
     """ initialize and load a configuration"""
diff --git a/planet/scrub.py b/planet/scrub.py
new file mode 100644
index 0000000..42d75ae
--- /dev/null
+++ b/planet/scrub.py
@@ -0,0 +1,94 @@
+"""
+Process a set of configuration defined sanitations on a given feed.
+"""
+
+# Standard library modules
+import time
+# Planet modules
+import planet, config, shell
+
+type_map = {'text': 'text/plain', 'html': 'text/html',
+            'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed_uri, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed_uri).split():
+        if tag.find('lang')>=0: tag='language'
+        if data.feed.has_key(tag): del data.feed[tag]
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+            for key in entry.keys():
+                if not key.endswith('_detail'): continue
+                for detail in entry[key].copy():
+                    if detail == tag: del entry[key][detail]
+
+    # adjust title types
+    if config.title_type(feed_uri):
+        title_type = config.title_type(feed_uri)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed_uri):
+        summary_type = config.summary_type(feed_uri)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed_uri):
+        content_type = config.content_type(feed_uri)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed_uri).find('html')>=0:
+        from shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
+    # handle dates in the future
+    future_dates = config.future_dates(feed_uri).lower()
+    if future_dates == 'ignore_date':
+        now = time.gmtime()
+        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
+            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
+        for entry in data.entries:
+            if entry.has_key('published_parsed') and entry['published_parsed']:
+                if entry['published_parsed'] > now:
+                    del entry['published_parsed']
+                    del entry['published']
+            if entry.has_key('updated_parsed') and entry['updated_parsed']:
+                if entry['updated_parsed'] > now:
+                    del entry['updated_parsed']
+                    del entry['updated']
+    elif future_dates == 'ignore_entry':
+        now = time.gmtime()
+        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
+            if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
+        data.entries = [entry for entry in data.entries if
+            (not entry.has_key('published_parsed') or not entry['published_parsed']
+                or entry['published_parsed'] <= now) and
+            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
+                or entry['updated_parsed'] <= now)]
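The two new branches turn on ordered comparisons between feedparser's *_parsed values and time.gmtime(); both are UTC struct_time nine-tuples, so comparing them element-wise orders them chronologically. A small illustration, not part of the patch:

    # Illustration only: struct_time values compare chronologically, which is
    # what lets scrub.py decide whether a parsed date lies in the future.
    import time

    now = time.gmtime()
    next_year = time.struct_time((now[0] + 1,) + tuple(now)[1:])
    print next_year > now    # True -- a date one year ahead sorts after 'now'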
diff --git a/planet/spider.py b/planet/spider.py
index bc22d1f..6630be9 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -7,7 +7,7 @@ and write each as a set of entries in a cache directory.
 import time, calendar, re, os, urlparse
 from xml.dom import minidom
 # Planet modules
-import planet, config, feedparser, reconstitute, shell, socket
+import planet, config, feedparser, reconstitute, shell, socket, scrub
 from StringIO import StringIO
 
 # Regular expressions to sanitise cache filenames
@@ -57,66 +57,6 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()
 
-type_map = {'text': 'text/plain', 'html': 'text/html',
-            'xhtml': 'application/xhtml+xml'}
-
-def scrub(feed, data):
-
-    # some data is not trustworthy
-    for tag in config.ignore_in_feed(feed).split():
-        if tag.find('lang')>=0: tag='language'
-        if data.feed.has_key(tag): del data.feed[tag]
-        for entry in data.entries:
-            if entry.has_key(tag): del entry[tag]
-            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
-            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
-            for key in entry.keys():
-                if not key.endswith('_detail'): continue
-                for detail in entry[key].copy():
-                    if detail == tag: del entry[key][detail]
-
-    # adjust title types
-    if config.title_type(feed):
-        title_type = config.title_type(feed)
-        title_type = type_map.get(title_type, title_type)
-        for entry in data.entries:
-            if entry.has_key('title_detail'):
-                entry.title_detail['type'] = title_type
-
-    # adjust summary types
-    if config.summary_type(feed):
-        summary_type = config.summary_type(feed)
-        summary_type = type_map.get(summary_type, summary_type)
-        for entry in data.entries:
-            if entry.has_key('summary_detail'):
-                entry.summary_detail['type'] = summary_type
-
-    # adjust content types
-    if config.content_type(feed):
-        content_type = config.content_type(feed)
-        content_type = type_map.get(content_type, content_type)
-        for entry in data.entries:
-            if entry.has_key('content'):
-                entry.content[0]['type'] = content_type
-
-    # some people put html in author names
-    if config.name_type(feed).find('html')>=0:
-        from planet.shell.tmpl import stripHtml
-        if data.feed.has_key('author_detail') and \
-            data.feed.author_detail.has_key('name'):
-            data.feed.author_detail['name'] = \
-                str(stripHtml(data.feed.author_detail.name))
-        for entry in data.entries:
-            if entry.has_key('author_detail') and \
-                entry.author_detail.has_key('name'):
-                entry.author_detail['name'] = \
-                    str(stripHtml(entry.author_detail.name))
-            if entry.has_key('source'):
-                source = entry.source
-                if source.has_key('author_detail') and \
-                    source.author_detail.has_key('name'):
-                    source.author_detail['name'] = \
-                        str(stripHtml(source.author_detail.name))
-
 def _is_http_uri(uri):
     parsed = urlparse.urlparse(uri)
     return parsed[0] in ['http', 'https']
@@ -209,7 +149,7 @@ def writeCache(feed_uri, feed_info, data):
             data.feed['planet_'+name] = value
 
     # perform user configured scrub operations on the data
-    scrub(feed_uri, data)
+    scrub.scrub(feed_uri, data)
 
     from planet import idindex
     global index
@@ -244,7 +184,7 @@ def writeCache(feed_uri, feed_info, data):
             mtime = calendar.timegm(data.feed.updated_parsed)
         except:
             pass
-    if not mtime or mtime > time.time(): mtime = time.time()
+    if not mtime: mtime = time.time()
     entry['updated_parsed'] = time.gmtime(mtime)
 
     # apply any filters
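With the sanitisation code factored out of the spider, it can also be driven on its own. A minimal sketch of the new entry point, mirroring the call spider.py now makes; the feed URI is hypothetical, and a configuration that defines that feed is assumed to have been loaded already:

    # Hypothetical feed URI; assumes config.load() (or an equivalent readfp)
    # has already registered a section for it.
    from planet import feedparser, scrub

    uri = 'http://example.com/atom.xml'
    data = feedparser.parse(uri)
    scrub.scrub(uri, data)    # apply ignore_in_feed, *_type, name_type, future_dates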
diff --git a/tests/test_scrub.py b/tests/test_scrub.py
index 7d9d1b0..17874a3 100644
--- a/tests/test_scrub.py
+++ b/tests/test_scrub.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 
-import unittest, StringIO
-from planet.spider import scrub
+import unittest, StringIO, time
+from copy import deepcopy
+from planet.scrub import scrub
 from planet import feedparser, config
 
 feed = '''
@@ -10,7 +11,7 @@ feed = '''
     ignoreme
     F&ouml;o
-    2000-01-01T00:00:00Z
+    %d-12-31T23:59:59Z
     F&ouml;o
     F&ouml;o
     F&ouml;o
@@ -19,11 +20,10 @@ feed = '''
-'''
+''' % (time.gmtime()[0] + 1)
 
 configData = '''
 [testfeed]
-ignore_in_feed = id updated xml:lang
 name_type = html
 title_type = html
 summary_type = html
@@ -32,16 +32,17 @@ content_type = html
 
 class ScrubTest(unittest.TestCase):
 
-    def test_scrub(self):
-        data = feedparser.parse(feed)
+    def test_scrub_ignore(self):
+        base = feedparser.parse(feed)
+
+        self.assertTrue(base.entries[0].has_key('id'))
+        self.assertTrue(base.entries[0].has_key('updated'))
+        self.assertTrue(base.entries[0].has_key('updated_parsed'))
+        self.assertTrue(base.entries[0].summary_detail.has_key('language'))
+
         config.parser.readfp(StringIO.StringIO(configData))
-
-        self.assertEqual('Föo', data.feed.author_detail.name)
-        self.assertTrue(data.entries[0].has_key('id'))
-        self.assertTrue(data.entries[0].has_key('updated'))
-        self.assertTrue(data.entries[0].has_key('updated_parsed'))
-        self.assertTrue(data.entries[0].summary_detail.has_key('language'))
-
+        config.parser.set('testfeed', 'ignore_in_feed', 'id updated xml:lang')
+        data = deepcopy(base)
         scrub('testfeed', data)
 
         self.assertFalse(data.entries[0].has_key('id'))
@@ -49,6 +50,15 @@ class ScrubTest(unittest.TestCase):
         self.assertFalse(data.entries[0].has_key('updated_parsed'))
         self.assertFalse(data.entries[0].summary_detail.has_key('language'))
 
+    def test_scrub_type(self):
+        base = feedparser.parse(feed)
+
+        self.assertEqual('Föo', base.feed.author_detail.name)
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        data = deepcopy(base)
+        scrub('testfeed', data)
+
         self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
         self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
         self.assertEqual('F\xc3\xb6o', data.entries[0].source.author_detail.name)
@@ -57,3 +67,18 @@ class ScrubTest(unittest.TestCase):
         self.assertEqual('text/html', data.entries[0].summary_detail.type)
         self.assertEqual('text/html', data.entries[0].content[0].type)
 
+    def test_scrub_future(self):
+        base = feedparser.parse(feed)
+        self.assertEqual(1, len(base.entries))
+        self.assertTrue(base.entries[0].has_key('updated'))
+
+        config.parser.readfp(StringIO.StringIO(configData))
+        config.parser.set('testfeed', 'future_dates', 'ignore_date')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertFalse(data.entries[0].has_key('updated'))
+
+        config.parser.set('testfeed', 'future_dates', 'ignore_entry')
+        data = deepcopy(base)
+        scrub('testfeed', data)
+        self.assertEqual(0, len(data.entries))
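The rewritten fixture is what makes the future_dates assertions deterministic: the entry's updated element is stamped with the last second of next year, so it is always in the future when the suite runs. For example:

    # What the '%d-12-31T23:59:59Z' substitution in the test fixture produces:
    import time
    print '%d-12-31T23:59:59Z' % (time.gmtime()[0] + 1)   # e.g. '2007-12-31T23:59:59Z' in 2006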