Numerous stability fixes: recover from HTTP and FeedParser errors

Sam Ruby 2006-10-01 05:14:29 -04:00
parent 9dbc6b5dbe
commit cacb8ffeb8
4 changed files with 52 additions and 18 deletions

View File

@@ -342,7 +342,11 @@ _cp1252 = {
 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
     uri = _urifixer.sub(r'\1\3', uri)
-    return urlparse.urljoin(base, uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
 
 class _FeedParserMixin:
     namespaces = {'': '',
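
Note on the _urljoin change: if urlparse.urljoin raises on a malformed reference, the new fallback percent-quotes each parsed component of the URI and joins again. A rough sketch of just that quoting step (the sample URI below is made up and only illustrates the mechanics):

    import urllib, urlparse

    bad = 'http://example.com/a b<c>'    # hypothetical reference with characters that need quoting
    parts = urlparse.urlparse(bad)       # scheme, netloc, path, params, query, fragment
    quoted = urlparse.urlunparse([urllib.quote(part) for part in parts])
    print urlparse.urljoin('http://example.com/feed', quoted)
    # -> http://example.com/a%20b%3Cc%3E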

View File

@@ -188,14 +188,14 @@ def source(xsource, source, bozo, format):
     date(xsource, 'updated', source.get('updated_parsed',time.gmtime()))
 
+    if format: source['planet_format'] = format
+    if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
+
     # propagate planet inserted information
     for key, value in source.items():
         if key.startswith('planet_'):
             createTextElement(xsource, key.replace('_',':',1), value)
 
-    createTextElement(xsource, 'planet:bozo', bozo and 'true' or 'false')
-    createTextElement(xsource, 'planet:format', format)
-
 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
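
Note on the reconstitute change: bozo and format are now recorded as planet_bozo / planet_format keys on the parsed source, so the existing planet_* propagation loop emits them as planet:bozo / planet:format elements instead of two dedicated createTextElement calls. A small sketch of that key mapping (the sample values are made up):

    source = {'planet_format': 'atom10', 'planet_bozo': 'false', 'title': 'Example'}
    for key, value in source.items():
        if key.startswith('planet_'):
            print key.replace('_', ':', 1), '=', value
    # prints planet:format = atom10 and planet:bozo = false (order may vary); title is skipped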

View File

@@ -99,6 +99,7 @@ def scrub(feed, data):
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
+    log = planet.logger
 
     # read cached feed info
     sources = config.cache_sources_directory()
@@ -125,8 +126,10 @@ def spiderFeed(feed):
         else:
             data.status = 500
 
+    activity_horizon = \
+        time.gmtime(time.time()-86400*config.activity_threshold(feed))
+
     # process based on the HTTP status code
-    log = planet.logger
     if data.status == 200 and data.has_key("url"):
         data.feed['planet_http_location'] = data.url
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
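
Note: activity_horizon is computed ahead of the status dispatch because the new 304 branch below needs it. It is simply a struct_time N days in the past, where N is the feed's activity_threshold; for example (the threshold value here is made up):

    import time

    threshold = 90    # hypothetical config.activity_threshold(feed) value, in days
    activity_horizon = time.gmtime(time.time() - 86400 * threshold)
    print time.strftime('%Y-%m-%dT%H:%M:%SZ', activity_horizon)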
@@ -134,8 +137,17 @@ def spiderFeed(feed):
         data.feed['planet_http_location'] = data.url
     elif data.status == 304:
         log.info("Feed %s unchanged", feed)
-        if not feed_info.feed.has_key('planet_message'): return
-        del feed_info.feed['planet_message']
+
+        if not feed_info.feed.has_key('planet_message'):
+            if feed_info.feed.has_key('planet_updated'):
+                updated = feed_info.feed.planet_updated
+                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+                    return
+        else:
+            if feed_info.feed.planet_message.startswith("no activity in"):
+                return
+            del feed_info.feed['planet_message']
+
     elif data.status == 410:
         log.info("Feed %s gone", feed)
     elif data.status == 408:
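
Note on the 304 branch: an unchanged feed now short-circuits only when the cache shows it is still healthy; if its cached planet_updated stamp falls inside the activity horizon, or it is already flagged with a "no activity in ..." message, nothing needs rewriting. Otherwise any stale message is cleared and processing continues. The comparison relies on feedparser's internal ISO 8601 parser returning a time tuple, which compares element-wise; a sketch with a made-up timestamp and threshold:

    import time, feedparser

    activity_horizon = time.gmtime(time.time() - 86400 * 90)     # hypothetical 90-day threshold
    updated = feedparser._parse_date_iso8601('2006-09-30T12:00:00Z')
    if updated and updated >= activity_horizon:
        print 'feed is still active; skip the rewrite'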
@@ -146,7 +158,9 @@ def spiderFeed(feed):
         log.info("Updating feed %s", feed)
 
     # if read failed, retain cached information
-    if not data.version and feed_info.version: data.feed = feed_info.feed
+    if not data.version and feed_info.version:
+        data.feed = feed_info.feed
+        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
     data.feed['planet_http_status'] = str(data.status)
 
     # capture etag and last-modified information
@@ -212,8 +226,6 @@ def spiderFeed(feed):
 
     # identify inactive feeds
     if config.activity_threshold(feed):
-        activity_horizon = \
-            time.gmtime(time.time()-86400*config.activity_threshold(feed))
         updated = [entry.updated_parsed for entry in data.entries
             if entry.has_key('updated_parsed')]
         updated.sort()
@@ -230,18 +242,20 @@ def spiderFeed(feed):
             data.feed['planet_message'] = msg
 
     # report channel level errors
-    if data.status == 403:
-        data.feed['planet_message'] = "403: forbidden"
+    if data.status == 226:
+        if data.feed.has_key('planet_message'): del data.feed['planet_message']
+    elif data.status == 403:
+        data.feed['planet_message'] = "403: forbidden"
     elif data.status == 404:
         data.feed['planet_message'] = "404: not found"
     elif data.status == 408:
         data.feed['planet_message'] = "408: request timeout"
     elif data.status == 410:
         data.feed['planet_message'] = "410: gone"
     elif data.status == 500:
         data.feed['planet_message'] = "internal server error"
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % data.status
 
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
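
Note on the new 226 branch: status 226 ("IM Used") is what feedparser reports for a successful RFC 3229 delta-encoded response, so the fetch worked and any error message left over from an earlier run is dropped rather than a new one being set. A minimal sketch of just that clearing step (the cached state below is hypothetical):

    data_feed = {'planet_message': '404: not found'}   # hypothetical leftover error from a previous run
    status = 226
    if status == 226 and data_feed.has_key('planet_message'):
        del data_feed['planet_message']                 # delta fetch succeeded; clear the stale message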
@@ -257,4 +271,12 @@ def spiderPlanet():
     planet.setTimeout(config.feed_timeout())
 
     for feed in config.subscriptions():
-        spiderFeed(feed)
+        try:
+            spiderFeed(feed)
+        except Exception,e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', feed)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
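
Note on spiderPlanet: this is the main stability fix. Any exception escaping spiderFeed is now logged (message plus formatted traceback) and the loop moves on to the next subscription instead of aborting the whole run. The same log-and-continue pattern in isolation (the failing function and logger name are illustrative, not from the patch):

    import sys, traceback, logging
    logging.basicConfig()
    log = logging.getLogger('planet')

    def fetch(item):
        raise ValueError('boom')          # stand-in for a feed that blows up mid-run

    for item in ['a', 'b', 'c']:
        try:
            fetch(item)
        except Exception, e:
            type, value, tb = sys.exc_info()
            log.error('Error processing %s', item)
            for line in (traceback.format_exception_only(type, value) +
                         traceback.format_tb(tb)):
                log.error(line.rstrip())
    # each failure is reported, and the loop still reaches 'b' and 'c'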

View File

@@ -78,11 +78,19 @@ class FilterTests(unittest.TestCase):
 try:
     from subprocess import Popen, PIPE
 
     sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
     sed.communicate()
     if sed.returncode != 0:
         logger.warn("sed is not available => can't test stripAd_yahoo")
         del FilterTests.test_stripAd_yahoo
+
+    try:
+        import libxml2
+    except:
+        logger.warn("libxml2 is not available => can't test xpath_sifter")
+        del FilterTests.test_xpath_filter
+
 except ImportError:
     logger.warn("Popen is not available => can't test filters")
     for method in dir(FilterTests):
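
Note on the test changes: the xpath_sifter filter needs libxml2, so the suite now probes for it the same way it probes for sed, dropping the test with a warning when the module is missing (the bare except also swallows failures other than ImportError during the libxml2 import). A generic form of this optional-dependency guard, with illustrative names not taken from the patch:

    import logging
    logging.basicConfig()
    logger = logging.getLogger('tests')

    def drop_test_unless(module_name, test_class, test_name, feature):
        # Remove test_name from test_class when module_name cannot be imported.
        try:
            __import__(module_name)
        except ImportError:
            logger.warn("%s is not available => can't test %s", module_name, feature)
            delattr(test_class, test_name)

    # e.g. drop_test_unless('libxml2', FilterTests, 'test_xpath_filter', 'xpath_sifter')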