Numerous stability fixes: recover from HTTP and FeedParser errors

Sam Ruby 2006-10-01 05:14:29 -04:00
parent 9dbc6b5dbe
commit cacb8ffeb8
4 changed files with 52 additions and 18 deletions

View File

@@ -342,7 +342,11 @@ _cp1252 = {
 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
     uri = _urifixer.sub(r'\1\3', uri)
-    return urlparse.urljoin(base, uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
 
 class _FeedParserMixin:
     namespaces = {'': '',
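The recovery path above can be read in isolation as the following minimal sketch (the helper name safe_urljoin is illustrative; in feedparser the logic lives inside _urljoin): when urlparse.urljoin raises on a malformed URI, each component is percent-encoded with urllib.quote and the join is retried.

    import urllib, urlparse

    def safe_urljoin(base, uri):
        try:
            return urlparse.urljoin(base, uri)
        except:
            # quote each component (scheme, netloc, path, params, query,
            # fragment), rebuild the URI, and retry the join
            parts = [urllib.quote(part) for part in urlparse.urlparse(uri)]
            return urlparse.urljoin(base, urlparse.urlunparse(parts))

    print safe_urljoin('http://example.com/feed.xml', 'entries/1')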

View File

@@ -188,14 +188,14 @@ def source(xsource, source, bozo, format):
     date(xsource, 'updated', source.get('updated_parsed',time.gmtime()))
 
-    if format: source['planet_format'] = format
-    if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
     # propagate planet inserted information
     for key, value in source.items():
         if key.startswith('planet_'):
             createTextElement(xsource, key.replace('_',':',1), value)
 
+    createTextElement(xsource, 'planet:bozo', bozo and 'true' or 'false')
+    createTextElement(xsource, 'planet:format', format)
 
 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
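Rough illustration of the effect of this hunk: planet:bozo and planet:format are now written directly as children of the cached entry's source element instead of being copied into the source dict first. The createTextElement below is a simplified stand-in for the helper of the same name in reconstitute.py.

    from xml.dom import minidom

    def createTextElement(parent, name, value):
        # simplified stand-in: append <name>value</name> under parent
        if value is None: return
        doc = parent.ownerDocument
        node = doc.createElement(name)
        node.appendChild(doc.createTextNode(str(value)))
        parent.appendChild(node)

    xdoc = minidom.parseString('<source xmlns="http://www.w3.org/2005/Atom"/>')
    createTextElement(xdoc.documentElement, 'planet:bozo', 'false')
    createTextElement(xdoc.documentElement, 'planet:format', 'atom10')
    print xdoc.documentElement.toxml()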

View File

@@ -99,6 +99,7 @@ def scrub(feed, data):
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
+    log = planet.logger
 
     # read cached feed info
     sources = config.cache_sources_directory()
@@ -125,8 +126,10 @@ def spiderFeed(feed):
         else:
             data.status = 500
 
+    activity_horizon = \
+        time.gmtime(time.time()-86400*config.activity_threshold(feed))
+
     # process based on the HTTP status code
-    log = planet.logger
     if data.status == 200 and data.has_key("url"):
         data.feed['planet_http_location'] = data.url
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
@@ -134,8 +137,17 @@ def spiderFeed(feed):
         data.feed['planet_http_location'] = data.url
     elif data.status == 304:
         log.info("Feed %s unchanged", feed)
-        if not feed_info.feed.has_key('planet_message'): return
-        del feed_info.feed['planet_message']
+
+        if not feed_info.feed.has_key('planet_message'):
+            if feed_info.feed.has_key('planet_updated'):
+                updated = feed_info.feed.planet_updated
+                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+                    return
+        else:
+            if feed_info.feed.planet_message.startswith("no activity in"):
+                return
+            del feed_info.feed['planet_message']
+
     elif data.status == 410:
         log.info("Feed %s gone", feed)
     elif data.status == 408:
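The 304 branch above hinges on comparing two UTC time tuples: the activity horizon derived from config.activity_threshold(feed), and the cached planet:updated timestamp parsed back with feedparser._parse_date_iso8601. A simplified sketch of that comparison, with time.strptime standing in for the feedparser date parser and hypothetical values for the threshold and timestamp:

    import time

    threshold_days = 90          # stand-in for config.activity_threshold(feed)
    activity_horizon = time.gmtime(time.time() - 86400 * threshold_days)

    planet_updated = "2006-09-30T12:00:00Z"   # hypothetical cached planet:updated
    updated = time.strptime(planet_updated, "%Y-%m-%dT%H:%M:%SZ")

    # time tuples compare field by field, so >= is a plain chronological test:
    # a feed updated after the horizon is still active and can be skipped as-is
    if updated >= activity_horizon:
        print "feed is still within the activity window"
    else:
        print "no activity since the horizon"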
@@ -146,7 +158,9 @@ def spiderFeed(feed):
     log.info("Updating feed %s", feed)
 
     # if read failed, retain cached information
-    if not data.version and feed_info.version: data.feed = feed_info.feed
+    if not data.version and feed_info.version:
+        data.feed = feed_info.feed
+        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
     data.feed['planet_http_status'] = str(data.status)
 
     # capture etag and last-modified information
@@ -212,8 +226,6 @@ def spiderFeed(feed):
 
     # identify inactive feeds
     if config.activity_threshold(feed):
-        activity_horizon = \
-            time.gmtime(time.time()-86400*config.activity_threshold(feed))
         updated = [entry.updated_parsed for entry in data.entries
             if entry.has_key('updated_parsed')]
         updated.sort()
@@ -230,18 +242,20 @@ def spiderFeed(feed):
             data.feed['planet_message'] = msg
 
     # report channel level errors
-    if data.status == 403:
-       data.feed['planet_message'] = "403: forbidden"
+    if data.status == 226:
+        if data.feed.has_key('planet_message'): del data.feed['planet_message']
+    elif data.status == 403:
+        data.feed['planet_message'] = "403: forbidden"
     elif data.status == 404:
-       data.feed['planet_message'] = "404: not found"
+        data.feed['planet_message'] = "404: not found"
     elif data.status == 408:
-       data.feed['planet_message'] = "408: request timeout"
+        data.feed['planet_message'] = "408: request timeout"
     elif data.status == 410:
-       data.feed['planet_message'] = "410: gone"
+        data.feed['planet_message'] = "410: gone"
     elif data.status == 500:
-       data.feed['planet_message'] = "internal server error"
+        data.feed['planet_message'] = "internal server error"
     elif data.status >= 400:
-       data.feed['planet_message'] = "http status %s" % data.status
+        data.feed['planet_message'] = "http status %s" % data.status
 
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
@@ -257,4 +271,12 @@ def spiderPlanet():
     planet.setTimeout(config.feed_timeout())
 
     for feed in config.subscriptions():
-        spiderFeed(feed)
+        try:
+            spiderFeed(feed)
+        except Exception,e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', feed)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
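The try/except added in spiderPlanet is what keeps one failing feed from aborting the whole run: the exception is caught, and the traceback is logged one line at a time so it survives log rotation and greps cleanly. The same pattern in isolation (module and function names below are illustrative):

    import sys, traceback, logging

    logging.basicConfig()
    log = logging.getLogger("spider-sketch")

    def fetch(feed):
        raise ValueError("boom")     # stand-in for a failing spiderFeed()

    for feed in ["http://example.com/feed.xml"]:
        try:
            fetch(feed)
        except Exception, e:
            type, value, tb = sys.exc_info()
            log.error('Error processing %s', feed)
            # format_exception_only yields the "ValueError: boom" line,
            # format_tb yields one string per stack frame
            for line in (traceback.format_exception_only(type, value) +
                         traceback.format_tb(tb)):
                log.error(line.rstrip())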

View File

@@ -78,11 +78,19 @@ class FilterTests(unittest.TestCase):
 try:
     from subprocess import Popen, PIPE
 
     sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
     sed.communicate()
     if sed.returncode != 0:
         logger.warn("sed is not available => can't test stripAd_yahoo")
         del FilterTests.test_stripAd_yahoo
 
+    try:
+        import libxml2
+    except:
+        logger.warn("libxml2 is not available => can't test xpath_sifter")
+        del FilterTests.test_xpath_filter
+
 except ImportError:
     logger.warn("Popen is not available => can't test filters")
     for method in dir(FilterTests):
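The added block follows the same approach already used for sed: probe for an optional dependency at import time and delete the test methods that need it when it is missing. A small self-contained sketch of the idiom (class and test names here are illustrative, not the ones from the Venus test suite):

    import logging, unittest

    logger = logging.getLogger("tests")

    class OptionalTests(unittest.TestCase):
        def test_needs_libxml2(self):
            import libxml2                 # only runs when libxml2 is installed
            self.assertEqual('a', libxml2.parseDoc('<a/>').getRootElement().name)

    try:
        import libxml2
    except ImportError:
        logger.warn("libxml2 is not available => skipping test_needs_libxml2")
        del OptionalTests.test_needs_libxml2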