From c20acf9944f9ef914ffe9654d32e9fbb14e0631b Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Wed, 22 Nov 2006 12:31:22 -0500
Subject: [PATCH] Hash content to determine if it was modified

---
 docs/config.html       | 11 +++++
 planet/config.py       |  8 ++--
 planet/reconstitute.py |  2 +-
 planet/spider.py       | 85 ++++++++++++++++++------------------------
 4 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/docs/config.html b/docs/config.html
index b201b26..f992e2e 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -100,7 +100,16 @@ use for logging output. Note: this configuration value is processed
 Number of items to take from new feeds
 spider_threads
 The number of threads to use when spidering. When set to 0, the default,
-no threads are used and spidering follows the traditional algorithm.
+no threads are used and spidering follows the traditional algorithm.
+spider_threads
+The number of threads to use when spidering. When set to 0, the default,
+no threads are used and spidering follows the traditional algorithm.
+http_cache_directory
+If spider_threads is specified, you can also specify a
+directory to be used for an additional HTTP cache to front end the Venus
+cache. If specified as a relative path, it is evaluated relative to the
+cache_directory.
+
diff --git a/planet/config.py b/planet/config.py
index 9526f36..da8de60 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -285,13 +285,13 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
 
 def http_cache_directory():
     if parser.has_option('Planet', 'http_cache_directory'):
-        parser.get('Planet', 'http_cache_directory')
-    else:
-        return os.path.join(cache_directory(), 'sources/http')
+        return os.path.join(cache_directory(),
+            parser.get('Planet', 'http_cache_directory'))
 
 def cache_sources_directory():
     if parser.has_option('Planet', 'cache_sources_directory'):
-        parser.get('Planet', 'cache_sources_directory')
+        return os.path.join(cache_directory(),
+            parser.get('Planet', 'cache_sources_directory'))
     else:
         return os.path.join(cache_directory(), 'sources')
 
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 6d7f43d..bf209c7 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -105,7 +105,7 @@ def links(xentry, entry):
     if entry.has_key('link'):
         entry['links'].append({'rel':'alternate', 'href':entry.link})
     xdoc = xentry.ownerDocument
-    for link in entry.links:
+    for link in entry['links']:
         if not 'href' in link.keys(): continue
         xlink = xdoc.createElement('link')
         xlink.setAttribute('href', link.get('href'))
diff --git a/planet/spider.py b/planet/spider.py
index 9463fdf..bc22d1f 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -121,36 +121,6 @@ def _is_http_uri(uri):
     parsed = urlparse.urlparse(uri)
     return parsed[0] in ['http', 'https']
 
-def spiderFeed(feed_uri, only_if_new=0):
-    """ Spider (fetch) a single feed """
-    log = planet.logger
-
-    # read cached feed info
-    sources = config.cache_sources_directory()
-    if not os.path.exists(sources):
-        os.makedirs(sources, 0700)
-
-    feed_source = filename(sources, feed_uri)
-    feed_info = feedparser.parse(feed_source)
-    if feed_info.feed and only_if_new:
-        log.info("Feed %s already in cache", feed_uri)
-        return
-    if feed_info.feed.get('planet_http_status',None) == '410':
-        log.info("Feed %s gone", feed_uri)
-        return
-
-    # read feed itself
-    modified = None
-    try:
-        modified=time.strptime(
-            feed_info.feed.get('planet_http_last_modified', None))
-    except:
-        pass
-    data = feedparser.parse(feed_info.feed.get('planet_http_location',feed_uri),
-        etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
-
-    writeCache(feed_uri, feed_info, data)
-
 def writeCache(feed_uri, feed_info, data):
     log = planet.logger
     sources = config.cache_sources_directory()
@@ -159,7 +129,8 @@ def writeCache(feed_uri, feed_info, data):
     if not data.has_key("status"):
         if data.has_key("entries") and len(data.entries)>0:
             data.status = 200
-        elif data.bozo and data.bozo_exception.__class__.__name__.lower()=='timeout':
+        elif data.bozo and \
+            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
         else:
             data.status = 500
@@ -210,11 +181,16 @@ def writeCache(feed_uri, feed_info, data):
     if data.has_key('headers'):
         if data.has_key('etag') and data.etag:
             data.feed['planet_http_etag'] = data.etag
-            log.debug("E-Tag: %s", data.etag)
-        if data.has_key('modified') and data.modified:
+        elif data.headers.has_key('etag') and data.headers['etag']:
+            data.feed['planet_http_etag'] = data.headers['etag']
+
+        if data.headers.has_key('last-modified'):
+            data.feed['planet_http_last_modified']=data.headers['last-modified']
+        elif data.has_key('modified') and data.modified:
             data.feed['planet_http_last_modified'] = time.asctime(data.modified)
-            log.debug("Last Modified: %s",
-                data.feed['planet_http_last_modified'])
+
+        if data.headers.has_key('-content-hash'):
+            data.feed['planet_content_hash'] = data.headers['-content-hash']
 
     # capture feed and data from the planet configuration file
     if data.version:
@@ -337,13 +313,11 @@ def writeCache(feed_uri, feed_info, data):
     xdoc.unlink()
 
 def httpThread(thread_index, input_queue, output_queue, log):
-    from Queue import Empty
-    import httplib2
+    import httplib2, md5
     from socket import gaierror, error
     from httplib import BadStatusLine
 
-    http_cache = config.http_cache_directory()
-    h = httplib2.Http(http_cache)
+    h = httplib2.Http(config.http_cache_directory())
     uri, feed_info = input_queue.get(block=True)
     while uri:
         log.info("Fetching %s via %d", uri, thread_index)
@@ -363,10 +337,26 @@ def httpThread(thread_index, input_queue, output_queue, log):
                 log.info("unable to map %s to a URI", uri)
                 idna = uri
 
+            # cache control headers
+            headers = {}
+            if feed_info.feed.has_key('planet_http_etag'):
+                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
+            if feed_info.feed.has_key('planet_http_last_modified'):
+                headers['If-Modified-Since'] = \
+                    feed_info.feed['planet_http_last_modified']
+
             # issue request
-            (resp, content) = h.request(idna)
-            if resp.status == 200 and resp.fromcache:
-                resp.status = 304
+            (resp, content) = h.request(idna, 'GET', headers=headers)
+
+            # unchanged detection
+            resp['-content-hash'] = md5.new(content or '').hexdigest()
+            if resp.status == 200:
+                if resp.fromcache:
+                    resp.status = 304
+                elif feed_info.feed.has_key('planet_content_hash') and \
+                    feed_info.feed['planet_content_hash'] == \
+                    resp['-content-hash']:
+                    resp.status = 304
 
             # build a file-like object
             feed = StringIO(content)
@@ -385,8 +375,7 @@
                 feed.headers['status'] = '408'
                 log.warn("Timeout in thread-%d", thread_index)
             else:
-                log.error("HTTP Error: %s in thread-%d",
-                    str(e), thread_index)
+                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
         except Exception, e:
             import sys, traceback
             type, value, tb = sys.exc_info()
@@ -428,7 +417,7 @@ def spiderPlanet(only_if_new = False):
     threads = {}
     if int(config.spider_threads()):
         http_cache = config.http_cache_directory()
-        if not os.path.exists(http_cache):
+        if http_cache and not os.path.exists(http_cache):
             os.makedirs(http_cache, 0700)
 
         # Start all the worker threads
@@ -484,9 +473,9 @@ def spiderPlanet(only_if_new = False):
                     data = feedparser.parse(feed, **options)
                 else:
-                    data = feedparser.FeedParserDict({'version':None,
-                        'headers':feed.headers, 'entries': [],
-                        'status': int(feed.headers.status)})
+                    data = feedparser.FeedParserDict({'version': None,
+                        'headers': feed.headers, 'entries': [], 'feed': {},
+                        'bozo': 0, 'status': int(feed.headers.status)})
 
                 writeCache(uri, feed_info, data)
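
The core idea of the patch is that the spider hashes the response body and treats a
byte-identical response as a 304, even when the server ignores If-None-Match and
If-Modified-Since. Below is a minimal standalone sketch of that idea only; it is not
Venus code, the names effective_status and previous_state are illustrative stand-ins
for the cached feed_info, and it uses hashlib instead of the md5 module the patch imports.

    import hashlib

    def effective_status(status, content, previous_state):
        """Return 304 when the body is byte-identical to the previous fetch."""
        content_hash = hashlib.md5(content or b'').hexdigest()
        if status == 200 and previous_state.get('planet_content_hash') == content_hash:
            status = 304            # treat an unchanged body as Not Modified
        previous_state['planet_content_hash'] = content_hash
        return status

    state = {}
    print(effective_status(200, b'<feed/>', state))   # 200: first fetch, hash recorded
    print(effective_status(200, b'<feed/>', state))   # 304: same body, skip reparsing

In the patch itself the digest is stored as planet_content_hash alongside the other
planet_http_* headers in the cached feed, so the next run can short-circuit parsing
even when the httplib2 cache misses.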