From 14b3ccf07e37beab081889838d1abb4d3211e641 Mon Sep 17 00:00:00 2001
From: Sam Ruby
Date: Thu, 24 Aug 2006 06:45:23 -0400
Subject: [PATCH] Etag and last-modified support

---
 planet/spider.py | 59 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 4 deletions(-)

diff --git a/planet/spider.py b/planet/spider.py
index c06c498..7fb49bb 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -45,8 +45,61 @@ def write(xdoc, out):
 
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
-    data = feedparser.parse(feed)
-    if not data.feed: return
+
+    # read cached feed info
+    sources = config.cache_sources_directory()
+    feed_source = filename(sources, feed)
+    feed_info = feedparser.parse(feed_source)
+    if feed_info.feed.get('planet_http_status',None) == '410': return
+
+    # read feed itself
+    modified = None
+    try:
+        modified=time.strptime(
+            feed_info.feed.get('planet_http_last_modified', None))
+    except:
+        pass
+    data = feedparser.parse(feed_info.feed.get('planet_http_location',feed),
+        etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
+
+    # capture http status
+    if not data.has_key("status"):
+        if data.has_key("entries") and len(data.entries)>0:
+            data.status = 200
+        elif data.bozo and data.bozo_exception.__class__.__name__=='Timeout':
+            data.status = 408
+        else:
+            data.status = 500
+    data.feed['planet_http_status'] = str(data.status)
+
+    # process based on the HTTP status code
+    log = planet.logger
+    if data.status == 301 and data.has_key("entries") and len(data.entries)>0:
+        log.warning("Feed has moved from <%s> to <%s>", feed, data.url)
+        data.feed['planet_http_location'] = data.url
+    elif data.status == 304:
+        return log.info("Feed %s unchanged", feed)
+    elif data.status >= 400:
+        feed_info.update(data.feed)
+        data.feed = feed_info
+        if data.status == 410:
+            log.info("Feed %s gone", feed)
+        elif data.status == 408:
+            log.warning("Feed %s timed out", feed)
+        else:
+            log.error("Error %d while updating feed %s", data.status, feed)
+    else:
+        log.info("Updating feed %s", feed)
+
+    # capture etag and last-modified information
+    if data.has_key('headers'):
+        if data.has_key('etag') and data.etag:
+            data.feed['planet_http_etag'] = data.etag
+            log.debug("E-Tag: %s", data.etag)
+        if data.has_key('modified') and data.modified:
+            data.feed['planet_http_last_modified'] = time.asctime(data.modified)
+            log.debug("Last Modified: %s",
+                data.feed['planet_http_last_modified'])
 
     # capture feed and data from the planet configuration file
     if not data.feed.has_key('links'): data.feed['links'] = list()
@@ -59,7 +112,6 @@ def spiderFeed(feed):
         data.feed['planet_'+name] = value
 
     # write the feed info to the cache
-    sources = config.cache_sources_directory()
    if not os.path.exists(sources): os.makedirs(sources)
    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
@@ -96,5 +148,4 @@ def spiderPlanet(configFile):
     planet.setTimeout(config.feed_timeout())
 
     for feed in config.feeds():
-        log.info("Updating feed %s", feed)
         spiderFeed(feed)
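
A note on the conditional fetch the patch relies on (an illustrative sketch, not part of the patch itself): feedparser.parse() accepts etag and modified arguments, returns status 304 with no entries when the server reports the feed unchanged, and exposes any new validators as data.etag and data.modified. The feed URL and cache variables below are hypothetical stand-ins for the values spider.py keeps in its sources cache:

    import feedparser

    url = 'http://example.com/feed.atom'    # hypothetical feed URL
    cached_etag = None                      # normally read back from the cache
    cached_modified = None

    # pass the cached validators so the server can answer 304 Not Modified
    data = feedparser.parse(url, etag=cached_etag, modified=cached_modified)

    if data.get('status') == 304:
        print('feed unchanged; server honored the cached validators')
    else:
        # remember the validators for the next poll; depending on the
        # feedparser version, 'modified' is a time tuple or a string
        cached_etag = data.get('etag')
        cached_modified = data.get('modified')
        print('fetched %d entries' % len(data.get('entries', [])))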
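
On the cached timestamp itself (again only a sketch): planet_http_last_modified is written with time.asctime() and read back with a one-argument time.strptime(), which works because strptime's default format, "%a %b %d %H:%M:%S %Y", is exactly the asctime layout:

    import time

    stamp = time.gmtime()          # stand-in for the feed's last-modified tuple
    text = time.asctime(stamp)     # e.g. 'Thu Aug 24 06:45:23 2006'
    parsed = time.strptime(text)   # default format matches asctime output
    assert parsed.tm_year == stamp.tm_year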