From cacb8ffeb8292923d61ebe6a619f98dab1d4c124 Mon Sep 17 00:00:00 2001
From: Sam Ruby <rubys@intertwingly.net>
Date: Sun, 1 Oct 2006 05:14:29 -0400
Subject: [PATCH] Numerous stability fixes: recover from HTTP and FeedParser errors

---
 planet/feedparser.py   |    6 ++++-
 planet/reconstitute.py |    6 +++---
 planet/spider.py       |   50 ++++++++++++++++++++++++++++++------------
 tests/test_filters.py  |    8 ++++++++
 4 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/planet/feedparser.py b/planet/feedparser.py
index 2bcb1a7..e35e247 100755
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@@ -342,7 +342,11 @@ _cp1252 = {
 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
     uri = _urifixer.sub(r'\1\3', uri)
-    return urlparse.urljoin(base, uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
 
 class _FeedParserMixin:
     namespaces = {'': '',
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 2badc50..196d691 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -188,14 +188,14 @@ def source(xsource, source, bozo, format):
 
     date(xsource, 'updated', source.get('updated_parsed',time.gmtime()))
 
+    if format: source['planet_format'] = format
+    if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
+
     # propagate planet inserted information
     for key, value in source.items():
         if key.startswith('planet_'):
             createTextElement(xsource, key.replace('_',':',1), value)
 
-    createTextElement(xsource, 'planet:bozo', bozo and 'true' or 'false')
-    createTextElement(xsource, 'planet:format', format)
-
 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
diff --git a/planet/spider.py b/planet/spider.py
index 2a414fd..d82fabc 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -99,6 +99,7 @@ def scrub(feed, data):
 
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
+    log = planet.logger
 
     # read cached feed info
     sources = config.cache_sources_directory()
@@ -125,8 +126,10 @@
         else:
             data.status = 500
 
+    activity_horizon = \
+        time.gmtime(time.time()-86400*config.activity_threshold(feed))
+
     # process based on the HTTP status code
-    log = planet.logger
     if data.status == 200 and data.has_key("url"):
         data.feed['planet_http_location'] = data.url
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
@@ -134,8 +137,17 @@
         data.feed['planet_http_location'] = data.url
     elif data.status == 304:
         log.info("Feed %s unchanged", feed)
-        if not feed_info.feed.has_key('planet_message'): return
-        del feed_info.feed['planet_message']
+
+        if not feed_info.feed.has_key('planet_message'):
+            if feed_info.feed.has_key('planet_updated'):
+                updated = feed_info.feed.planet_updated
+                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+                    return
+        else:
+            if feed_info.feed.planet_message.startswith("no activity in"):
+                return
+            del feed_info.feed['planet_message']
+
     elif data.status == 410:
         log.info("Feed %s gone", feed)
     elif data.status == 408:
@@ -146,7 +158,9 @@
         log.info("Updating feed %s", feed)
 
     # if read failed, retain cached information
-    if not data.version and feed_info.version: data.feed = feed_info.feed
+    if not data.version and feed_info.version:
+        data.feed = feed_info.feed
+        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
     data.feed['planet_http_status'] = str(data.status)
 
     # capture etag and last-modified information
@@ -212,8 +226,6 @@
 
     # identify inactive feeds
     if config.activity_threshold(feed):
-        activity_horizon = \
-            time.gmtime(time.time()-86400*config.activity_threshold(feed))
         updated = [entry.updated_parsed for entry in data.entries
             if entry.has_key('updated_parsed')]
         updated.sort()
@@ -230,18 +242,20 @@
             data.feed['planet_message'] = msg
 
     # report channel level errors
-    if data.status == 403:
-       data.feed['planet_message'] = "403: forbidden"
+    if data.status == 226:
+        if data.feed.has_key('planet_message'): del data.feed['planet_message']
+    elif data.status == 403:
+        data.feed['planet_message'] = "403: forbidden"
     elif data.status == 404:
-       data.feed['planet_message'] = "404: not found"
+        data.feed['planet_message'] = "404: not found"
     elif data.status == 408:
-       data.feed['planet_message'] = "408: request timeout"
+        data.feed['planet_message'] = "408: request timeout"
     elif data.status == 410:
-       data.feed['planet_message'] = "410: gone"
+        data.feed['planet_message'] = "410: gone"
     elif data.status == 500:
-       data.feed['planet_message'] = "internal server error"
+        data.feed['planet_message'] = "internal server error"
     elif data.status >= 400:
-       data.feed['planet_message'] = "http status %s" % data.status
+        data.feed['planet_message'] = "http status %s" % data.status
 
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
@@ -257,4 +271,12 @@ def spiderPlanet():
     planet.setTimeout(config.feed_timeout())
 
     for feed in config.subscriptions():
-        spiderFeed(feed)
+        try:
+            spiderFeed(feed)
+        except Exception,e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', feed)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
diff --git a/tests/test_filters.py b/tests/test_filters.py
index f979946..e8c5633 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -78,11 +78,19 @@ class FilterTests(unittest.TestCase):
 
 try:
     from subprocess import Popen, PIPE
+
     sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
     sed.communicate()
     if sed.returncode != 0:
         logger.warn("sed is not available => can't test stripAd_yahoo")
         del FilterTests.test_stripAd_yahoo
+
+    try:
+        import libxml2
+    except:
+        logger.warn("libxml2 is not available => can't test xpath_sifter")
+        del FilterTests.test_xpath_filter
+
 except ImportError:
     logger.warn("Popen is not available => can't test filters")
     for method in dir(FilterTests):
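-- 

Editor's notes (below the signature delimiter, so git am ignores them):

The _urljoin hunk above retries a failed join after percent-quoting each URI
component. A minimal standalone sketch of that recovery idea, assuming the
Python 2 urlparse/urllib modules; the safe_urljoin name is illustrative, not
a feedparser API:

    import urllib
    import urlparse

    def safe_urljoin(base, uri):
        # A well-formed URI joins directly.
        try:
            return urlparse.urljoin(base, uri)
        except Exception:
            # Fall back: percent-quote each of the six URI components
            # and retry the join with the sanitized URI.
            parts = urlparse.urlparse(uri)
            uri = urlparse.urlunparse([urllib.quote(part) for part in parts])
            return urlparse.urljoin(base, uri)

    print safe_urljoin('http://example.com/feed/', 'entry?id=1')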
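Similarly, the spiderPlanet hunk keeps one failing feed from aborting the
whole run. A rough standalone sketch of the same isolation pattern, using
only the standard library; fetch_feed and the example URLs are placeholders,
not Planet APIs:

    import logging
    import traceback

    logging.basicConfig()
    log = logging.getLogger('spider')

    def fetch_feed(url):
        # Placeholder for the real fetch/parse work; may raise on bad input.
        raise IOError('simulated failure for %s' % url)

    for feed in ['http://example.com/a.xml', 'http://example.com/b.xml']:
        try:
            fetch_feed(feed)
        except Exception:
            # Log the error and its traceback, then move on to the next feed.
            log.error('Error processing %s', feed)
            for line in traceback.format_exc().splitlines():
                log.error(line.rstrip())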