From c20acf9944f9ef914ffe9654d32e9fbb14e0631b Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Wed, 22 Nov 2006 12:31:22 -0500
Subject: [PATCH] Hash content to determine if it was modified

---
 docs/config.html       | 11 +++++
 planet/config.py       |  8 ++--
 planet/reconstitute.py |  2 +-
 planet/spider.py       | 85 ++++++++++++++++++------------------------
 4 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/docs/config.html b/docs/config.html
index b201b26..f992e2e 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -100,7 +100,16 @@ use for logging output. Note: this configuration value is processed
 Number of items to take from new feeds
 spider_threads
 The number of threads to use when spidering. When set to 0, the default,
-no threads are used and spidering follows the traditional algorithm.
+no threads are used and spidering follows the traditional algorithm.
+spider_threads
+The number of threads to use when spidering. When set to 0, the default,
+no threads are used and spidering follows the traditional algorithm.
+http_cache_directory
+If spider_threads is specified, you can also specify a
+directory to be used for an additional HTTP cache to front end the Venus
+cache. If specified as a relative path, it is evaluated relative to the
+cache_directory.
+
diff --git a/planet/config.py b/planet/config.py
index 9526f36..da8de60 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -285,13 +285,13 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
 
 def http_cache_directory():
     if parser.has_option('Planet', 'http_cache_directory'):
-        parser.get('Planet', 'http_cache_directory')
-    else:
-        return os.path.join(cache_directory(), 'sources/http')
+        return os.path.join(cache_directory(),
+            parser.get('Planet', 'http_cache_directory'))
 
 def cache_sources_directory():
     if parser.has_option('Planet', 'cache_sources_directory'):
-        parser.get('Planet', 'cache_sources_directory')
+        return os.path.join(cache_directory(),
+            parser.get('Planet', 'cache_sources_directory'))
     else:
         return os.path.join(cache_directory(), 'sources')
 
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 6d7f43d..bf209c7 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -105,7 +105,7 @@ def links(xentry, entry):
     if entry.has_key('link'):
         entry['links'].append({'rel':'alternate', 'href':entry.link})
     xdoc = xentry.ownerDocument
-    for link in entry.links:
+    for link in entry['links']:
         if not 'href' in link.keys(): continue
         xlink = xdoc.createElement('link')
         xlink.setAttribute('href', link.get('href'))
diff --git a/planet/spider.py b/planet/spider.py
index 9463fdf..bc22d1f 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -121,36 +121,6 @@ def _is_http_uri(uri):
     parsed = urlparse.urlparse(uri)
     return parsed[0] in ['http', 'https']
 
-def spiderFeed(feed_uri, only_if_new=0):
-    """ Spider (fetch) a single feed """
-    log = planet.logger
-
-    # read cached feed info
-    sources = config.cache_sources_directory()
-    if not os.path.exists(sources):
-        os.makedirs(sources, 0700)
-
-    feed_source = filename(sources, feed_uri)
-    feed_info = feedparser.parse(feed_source)
-    if feed_info.feed and only_if_new:
-        log.info("Feed %s already in cache", feed_uri)
-        return
-    if feed_info.feed.get('planet_http_status',None) == '410':
-        log.info("Feed %s gone", feed_uri)
-        return
-
-    # read feed itself
-    modified = None
-    try:
-        modified=time.strptime(
-            feed_info.feed.get('planet_http_last_modified', None))
-    except:
-        pass
-    data = feedparser.parse(feed_info.feed.get('planet_http_location',feed_uri),
-        etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
-
-    writeCache(feed_uri, feed_info, data)
-
 def writeCache(feed_uri, feed_info, data):
     log = planet.logger
     sources = config.cache_sources_directory()
@@ -159,7 +129,8 @@ def writeCache(feed_uri, feed_info, data):
     if not data.has_key("status"):
         if data.has_key("entries") and len(data.entries)>0:
             data.status = 200
-        elif data.bozo and data.bozo_exception.__class__.__name__.lower()=='timeout':
+        elif data.bozo and \
+            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
         else:
             data.status = 500
@@ -210,11 +181,16 @@ def writeCache(feed_uri, feed_info, data):
     if data.has_key('headers'):
         if data.has_key('etag') and data.etag:
             data.feed['planet_http_etag'] = data.etag
-            log.debug("E-Tag: %s", data.etag)
-        if data.has_key('modified') and data.modified:
+        elif data.headers.has_key('etag') and data.headers['etag']:
+            data.feed['planet_http_etag'] = data.headers['etag']
+
+        if data.headers.has_key('last-modified'):
+            data.feed['planet_http_last_modified']=data.headers['last-modified']
+        elif data.has_key('modified') and data.modified:
             data.feed['planet_http_last_modified'] = time.asctime(data.modified)
-            log.debug("Last Modified: %s",
-                data.feed['planet_http_last_modified'])
+
+        if data.headers.has_key('-content-hash'):
+            data.feed['planet_content_hash'] = data.headers['-content-hash']
 
     # capture feed and data from the planet configuration file
     if data.version:
@@ -337,13 +313,11 @@ def writeCache(feed_uri, feed_info, data):
     xdoc.unlink()
 
 def httpThread(thread_index, input_queue, output_queue, log):
-    from Queue import Empty
-    import httplib2
+    import httplib2, md5
     from socket import gaierror, error
     from httplib import BadStatusLine
 
-    http_cache = config.http_cache_directory()
-    h = httplib2.Http(http_cache)
+    h = httplib2.Http(config.http_cache_directory())
     uri, feed_info = input_queue.get(block=True)
     while uri:
         log.info("Fetching %s via %d", uri, thread_index)
@@ -363,10 +337,26 @@ def httpThread(thread_index, input_queue, output_queue, log):
                 log.info("unable to map %s to a URI", uri)
                 idna = uri
 
+            # cache control headers
+            headers = {}
+            if feed_info.feed.has_key('planet_http_etag'):
+                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
+            if feed_info.feed.has_key('planet_http_last_modified'):
+                headers['If-Modified-Since'] = \
+                    feed_info.feed['planet_http_last_modified']
+
             # issue request
-            (resp, content) = h.request(idna)
-            if resp.status == 200 and resp.fromcache:
-                resp.status = 304
+            (resp, content) = h.request(idna, 'GET', headers=headers)
+
+            # unchanged detection
+            resp['-content-hash'] = md5.new(content or '').hexdigest()
+            if resp.status == 200:
+                if resp.fromcache:
+                    resp.status = 304
+                elif feed_info.feed.has_key('planet_content_hash') and \
+                    feed_info.feed['planet_content_hash'] == \
+                    resp['-content-hash']:
+                    resp.status = 304
 
             # build a file-like object
             feed = StringIO(content)
@@ -385,8 +375,7 @@
                 feed.headers['status'] = '408'
                 log.warn("Timeout in thread-%d", thread_index)
             else:
-                log.error("HTTP Error: %s in thread-%d",
-                    str(e), thread_index)
+                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
         except Exception, e:
             import sys, traceback
             type, value, tb = sys.exc_info()
@@ -428,7 +417,7 @@ def spiderPlanet(only_if_new = False):
     threads = {}
     if int(config.spider_threads()):
         http_cache = config.http_cache_directory()
-        if not os.path.exists(http_cache):
+        if http_cache and not os.path.exists(http_cache):
             os.makedirs(http_cache, 0700)
 
         # Start all the worker threads
@@ -484,9 +473,9 @@ def spiderPlanet(only_if_new = False):
                     data = feedparser.parse(feed, **options)
                 else:
-                    data = feedparser.FeedParserDict({'version':None,
-                        'headers':feed.headers, 'entries': [],
-                        'status': int(feed.headers.status)})
+                    data = feedparser.FeedParserDict({'version': None,
+                        'headers': feed.headers, 'entries': [], 'feed': {},
+                        'bozo': 0, 'status': int(feed.headers.status)})
 
                 writeCache(uri, feed_info, data)
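
The core idea of the patch is that the spider hashes the response body and treats a
byte-identical response as a 304, even when the server ignores If-None-Match and
If-Modified-Since. Below is a minimal standalone sketch of that idea only; it is not
Venus code, the names effective_status and previous_state are illustrative stand-ins
for the cached feed_info, and it uses hashlib instead of the md5 module the patch imports.

    import hashlib

    def effective_status(status, content, previous_state):
        """Return 304 when the body is byte-identical to the previous fetch."""
        content_hash = hashlib.md5(content or b'').hexdigest()
        if status == 200 and previous_state.get('planet_content_hash') == content_hash:
            status = 304            # treat an unchanged body as Not Modified
        previous_state['planet_content_hash'] = content_hash
        return status

    state = {}
    print(effective_status(200, b'<feed/>', state))   # 200: first fetch, hash recorded
    print(effective_status(200, b'<feed/>', state))   # 304: same body, skip reparsing

In the patch itself the digest is stored as planet_content_hash alongside the other
planet_http_* headers in the cached feed, so the next run can short-circuit parsing
even when the httplib2 cache misses.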