diff --git a/planet/feedparser.py b/planet/feedparser.py
index 2bcb1a7..e35e247 100755
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@@ -342,7 +342,11 @@ _cp1252 = {
 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
     uri = _urifixer.sub(r'\1\3', uri)
-    return urlparse.urljoin(base, uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
 
 class _FeedParserMixin:
     namespaces = {'': '',
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 2badc50..196d691 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -188,14 +188,14 @@ def source(xsource, source, bozo, format):
 
     date(xsource, 'updated', source.get('updated_parsed',time.gmtime()))
 
+    if format: source['planet_format'] = format
+    if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
+
     # propagate planet inserted information
     for key, value in source.items():
         if key.startswith('planet_'):
             createTextElement(xsource, key.replace('_',':',1), value)
 
-    createTextElement(xsource, 'planet:bozo', bozo and 'true' or 'false')
-    createTextElement(xsource, 'planet:format', format)
-
 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
diff --git a/planet/shell/py.py b/planet/shell/py.py
index f4476fa..8f365f6 100644
--- a/planet/shell/py.py
+++ b/planet/shell/py.py
@@ -1,4 +1,5 @@
 from subprocess import Popen, PIPE
+import sys
 
 def run(script, doc, output_file=None, options={}):
     """ process an Python script """
@@ -10,7 +11,7 @@ def run(script, doc, output_file=None, options={}):
 
     options = sum([['--'+key, value] for key,value in options.items()], [])
 
-    proc = Popen(['python', script] + options,
+    proc = Popen([sys.executable, script] + options,
         stdin=PIPE, stdout=out, stderr=PIPE)
 
     stdout, stderr = proc.communicate(doc)
diff --git a/planet/spider.py b/planet/spider.py
index 009d1d0..d82fabc 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -99,6 +99,7 @@ def scrub(feed, data):
 
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
+    log = planet.logger
 
     # read cached feed info
     sources = config.cache_sources_directory()
@@ -116,9 +117,6 @@
     data = feedparser.parse(feed_info.feed.get('planet_http_location',feed),
         etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
 
-    # if read failed, retain cached information
-    if not data.version and feed_info.version: data.feed = feed_info.feed
-
     # capture http status
     if not data.has_key("status"):
         if data.has_key("entries") and len(data.entries)>0:
@@ -127,29 +125,44 @@
             data.status = 408
         else:
             data.status = 500
-    data.feed['planet_http_status'] = str(data.status)
+
+    activity_horizon = \
+        time.gmtime(time.time()-86400*config.activity_threshold(feed))
 
     # process based on the HTTP status code
-    log = planet.logger
     if data.status == 200 and data.has_key("url"):
         data.feed['planet_http_location'] = data.url
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
         log.warning("Feed has moved from <%s> to <%s>", feed, data.url)
         data.feed['planet_http_location'] = data.url
     elif data.status == 304:
-        return log.info("Feed %s unchanged", feed)
-    elif data.status >= 400:
-        feed_info.update(data.feed)
-        data.feed = feed_info
-        if data.status == 410:
-            log.info("Feed %s gone", feed)
-        elif data.status == 408:
-            log.warning("Feed %s timed out", feed)
+        log.info("Feed %s unchanged", feed)
+
+        if not feed_info.feed.has_key('planet_message'):
+            if feed_info.feed.has_key('planet_updated'):
+                updated = feed_info.feed.planet_updated
+                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+                    return
         else:
-            log.error("Error %d while updating feed %s", data.status, feed)
+            if feed_info.feed.planet_message.startswith("no activity in"):
+                return
+            del feed_info.feed['planet_message']
+
+    elif data.status == 410:
+        log.info("Feed %s gone", feed)
+    elif data.status == 408:
+        log.warning("Feed %s timed out", feed)
+    elif data.status >= 400:
+        log.error("Error %d while updating feed %s", data.status, feed)
     else:
         log.info("Updating feed %s", feed)
 
+    # if read failed, retain cached information
+    if not data.version and feed_info.version:
+        data.feed = feed_info.feed
+        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
+    data.feed['planet_http_status'] = str(data.status)
+
     # capture etag and last-modified information
     if data.has_key('headers'):
         if data.has_key('etag') and data.etag:
@@ -213,29 +226,36 @@
 
     # identify inactive feeds
     if config.activity_threshold(feed):
-        activity_horizon = \
-            time.gmtime(time.time()-86400*config.activity_threshold(feed))
         updated = [entry.updated_parsed for entry in data.entries
             if entry.has_key('updated_parsed')]
         updated.sort()
+
+        if updated:
+            data.feed['planet_updated'] = \
+                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
+        elif data.feed.has_key('planet_updated'):
+            updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]
+
         if not updated or updated[-1] < activity_horizon:
             msg = "no activity in %d days" % config.activity_threshold(feed)
             log.info(msg)
             data.feed['planet_message'] = msg
 
     # report channel level errors
-    if data.status == 403:
-        data.feed['planet_message'] = "403: forbidden"
+    if data.status == 226:
+        if data.feed.has_key('planet_message'): del data.feed['planet_message']
+    elif data.status == 403:
+        data.feed['planet_message'] = "403: forbidden"
     elif data.status == 404:
-        data.feed['planet_message'] = "404: not found"
+        data.feed['planet_message'] = "404: not found"
     elif data.status == 408:
-        data.feed['planet_message'] = "408: request timeout"
+        data.feed['planet_message'] = "408: request timeout"
     elif data.status == 410:
-        data.feed['planet_message'] = "410: gone"
+        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
-        data.feed['planet_message'] = "internal server error"
+        data.feed['planet_message'] = "internal server error"
     elif data.status >= 400:
-        data.feed['planet_message'] = "http status %s" % data.status
+        data.feed['planet_message'] = "http status %s" % data.status
 
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
@@ -251,4 +271,12 @@
     planet.setTimeout(config.feed_timeout())
 
     for feed in config.subscriptions():
-        spiderFeed(feed)
+        try:
+            spiderFeed(feed)
+        except Exception,e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', feed)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
diff --git a/tests/test_filters.py b/tests/test_filters.py
index f979946..e8c5633 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -78,11 +78,19 @@ class FilterTests(unittest.TestCase):
 
 try:
     from subprocess import Popen, PIPE
+
     sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
     sed.communicate()
     if sed.returncode != 0:
         logger.warn("sed is not available => can't test stripAd_yahoo")
         del FilterTests.test_stripAd_yahoo
+
+    try:
+        import libxml2
+    except:
+        logger.warn("libxml2 is not available => can't test xpath_sifter")
+        del FilterTests.test_xpath_filter
+
 except ImportError:
     logger.warn("Popen is not available => can't test filters")
     for method in dir(FilterTests):
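
Not part of the patch: a minimal standalone sketch (Python 2, matching the style above) of the per-feed error isolation that the spiderPlanet() hunk introduces, where one misbehaving feed logs a traceback instead of aborting the whole run. The feed list and process_feed() are hypothetical stand-ins for config.subscriptions() and spiderFeed().

    import logging, sys, traceback

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger('spider')

    def process_feed(feed):
        # stand-in for spiderFeed(); one feed deliberately fails
        if 'bad' in feed:
            raise ValueError('simulated parse failure')
        log.info('Updating feed %s', feed)

    for feed in ['http://example.com/good', 'http://example.com/bad']:
        try:
            process_feed(feed)
        except Exception, e:
            # same pattern as the patch: log the error and keep going
            type, value, tb = sys.exc_info()
            log.error('Error processing %s', feed)
            for line in (traceback.format_exception_only(type, value) +
                         traceback.format_tb(tb)):
                log.error(line.rstrip())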