Numerous stability fixes: recover from HTTP and FeedParser errors
parent 9dbc6b5dbe
commit cacb8ffeb8
@@ -342,6 +342,10 @@ _cp1252 = {
 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
     uri = _urifixer.sub(r'\1\3', uri)
+    try:
         return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)
 
 class _FeedParserMixin:
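Note on the hunk above: urlparse.urljoin can raise on malformed input, and the new wrapper lets _urljoin fall back to percent-quoting every URI component before retrying. A minimal sketch of that recovery path (Python 2; the failing input below, non-ASCII bytes joined against a unicode base, is one assumed trigger, not the only one):

# Hypothetical demo of the fallback; not part of the commit itself.
import urllib, urlparse

def demo_urljoin(base, uri):
    try:
        return urlparse.urljoin(base, uri)      # may raise, e.g. UnicodeDecodeError
    except:
        # quote each of the six urlparse components, then reassemble
        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)

print demo_urljoin(u'http://example.com/feeds/', 'caf\xe9.xml')
# -> http://example.com/feeds/caf%E9.xml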
@@ -188,14 +188,14 @@ def source(xsource, source, bozo, format):
 
     date(xsource, 'updated', source.get('updated_parsed',time.gmtime()))
 
+    if format: source['planet_format'] = format
+    if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
+
     # propagate planet inserted information
     for key, value in source.items():
         if key.startswith('planet_'):
             createTextElement(xsource, key.replace('_',':',1), value)
 
-    createTextElement(xsource, 'planet:bozo', bozo and 'true' or 'false')
-    createTextElement(xsource, 'planet:format', format)
-
 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
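The bozo flag and format now flow through the same planet_* propagation loop as everything else instead of being written out separately. That loop relies on str.replace with a count of 1, so only the first underscore becomes a namespace separator and compound keys keep their names. A quick standalone illustration:

for key in ('planet_bozo', 'planet_format', 'planet_http_status'):
    print key, '->', key.replace('_', ':', 1)
# planet_bozo -> planet:bozo
# planet_format -> planet:format
# planet_http_status -> planet:http_status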
@@ -99,6 +99,7 @@ def scrub(feed, data):
 
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
+    log = planet.logger
 
     # read cached feed info
     sources = config.cache_sources_directory()
@@ -125,8 +126,10 @@ def spiderFeed(feed):
     else:
         data.status = 500
 
+    activity_horizon = \
+        time.gmtime(time.time()-86400*config.activity_threshold(feed))
+
     # process based on the HTTP status code
-    log = planet.logger
     if data.status == 200 and data.has_key("url"):
         data.feed['planet_http_location'] = data.url
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
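activity_horizon is now computed once, up front, because both the 304 branch below and the later inactivity check need it. It is a UTC struct_time N days in the past (N = config.activity_threshold(feed), in days; 86400 seconds per day), and struct_time values compare element-wise, which is what makes the >= tests against parsed dates work. A small sketch (the threshold value is hypothetical):

import time

days = 90   # hypothetical activity_threshold
activity_horizon = time.gmtime(time.time() - 86400*days)

print time.gmtime()  >= activity_horizon   # True: "now" is inside the window
print time.gmtime(0) >= activity_horizon   # False: the epoch is long past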
@@ -134,8 +137,17 @@ def spiderFeed(feed):
         data.feed['planet_http_location'] = data.url
     elif data.status == 304:
         log.info("Feed %s unchanged", feed)
-        if not feed_info.feed.has_key('planet_message'): return
+
+        if not feed_info.feed.has_key('planet_message'):
+            if feed_info.feed.has_key('planet_updated'):
+                updated = feed_info.feed.planet_updated
+                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+                    return
+        else:
+            if feed_info.feed.planet_message.startswith("no activity in"):
+                return
+            del feed_info.feed['planet_message']
 
     elif data.status == 410:
         log.info("Feed %s gone", feed)
     elif data.status == 408:
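The reworked 304 branch: with no cached planet_message the feed still returns early; a feed whose cached planet_updated falls inside the activity horizon also returns; a "no activity in" message is kept (the feed is still inactive), while any other stale message is cleared so the cached copy gets rewritten. The date comparison leans on feedparser's private _parse_date_iso8601 helper, assumed here to return a UTC 9-tuple comparable against the horizon; a sketch of that assumption:

import feedparser, time

# private feedparser helper: ISO 8601 string -> 9-tuple time value
updated = feedparser._parse_date_iso8601('2006-08-01T12:00:00Z')
horizon = time.gmtime(time.time() - 86400*90)
print updated >= horizon   # False by now; True while the entry was fresh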
@@ -146,7 +158,9 @@ def spiderFeed(feed):
         log.info("Updating feed %s", feed)
 
     # if read failed, retain cached information
-    if not data.version and feed_info.version: data.feed = feed_info.feed
+    if not data.version and feed_info.version:
+        data.feed = feed_info.feed
+        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
     data.feed['planet_http_status'] = str(data.status)
 
     # capture etag and last-modified information
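When the fetch failed outright (no parsed version), the cached feed is restored, and now the cached bozo flag comes back with it. planet_bozo is serialized as the strings 'true'/'false', with a missing key pessimistically treated as bozo:

cached = {'planet_bozo': 'false'}
bozo = cached.get('planet_bozo', 'true') == 'true'
print bozo   # False -- the cached copy parsed cleanly last time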
@@ -212,8 +226,6 @@ def spiderFeed(feed):
 
     # identify inactive feeds
     if config.activity_threshold(feed):
-        activity_horizon = \
-            time.gmtime(time.time()-86400*config.activity_threshold(feed))
         updated = [entry.updated_parsed for entry in data.entries
             if entry.has_key('updated_parsed')]
         updated.sort()
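The horizon computation moved out of this block (see the earlier hunk); what remains collects each entry's updated_parsed value, sorts, and compares the newest against the horizon. Entries without a parsed date simply drop out of the comprehension. A standalone sketch of that pattern (the sample entries are made up):

import time

entries = [{'updated_parsed': time.gmtime(0)},   # ancient
           {},                                   # no date at all
           {'updated_parsed': time.gmtime()}]    # fresh
updated = [e['updated_parsed'] for e in entries if e.has_key('updated_parsed')]
updated.sort()
print time.strftime('%Y-%m-%d', updated[-1])     # the newest entry wins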
@@ -230,7 +242,9 @@ def spiderFeed(feed):
             data.feed['planet_message'] = msg
 
     # report channel level errors
-    if data.status == 403:
+    if data.status == 226:
+        if data.feed.has_key('planet_message'): del data.feed['planet_message']
+    elif data.status == 403:
         data.feed['planet_message'] = "403: forbidden"
     elif data.status == 404:
         data.feed['planet_message'] = "404: not found"
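Status 226 is "IM Used" from RFC 3229 delta encoding, assumed here to mean a successful (partial) fetch, so it clears any stale planet_message while 403/404 keep annotating the channel. The branch ordering reduces to clear-on-success, annotate-on-error; a condensed sketch of that mapping (hypothetical helper, not in the commit):

def channel_message(status):
    if status == 226:
        return None                     # healthy delta fetch: clear any message
    return {403: "403: forbidden",
            404: "404: not found"}.get(status)

print channel_message(226), channel_message(404)   # None 404: not found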
@@ -257,4 +271,12 @@ def spiderPlanet():
     planet.setTimeout(config.feed_timeout())
 
     for feed in config.subscriptions():
+        try:
             spiderFeed(feed)
+        except Exception,e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', feed)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
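This is the headline fix: a per-feed try/except in spiderPlanet, so one misbehaving feed can no longer abort the whole run, while format_exception_only plus format_tb reproduce in the log the same detail an unhandled crash would have printed. A self-contained sketch of the same pattern (the feed URL and failure are illustrative):

import sys, traceback, logging
logging.basicConfig()
log = logging.getLogger('spider')

def unreliable_fetch():
    raise ValueError('simulated parser failure')

try:
    unreliable_fetch()
except Exception, e:
    type, value, tb = sys.exc_info()
    log.error('Error processing %s', 'http://example.com/feed')
    for line in (traceback.format_exception_only(type, value) +
            traceback.format_tb(tb)):
        log.error(line.rstrip())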
@@ -78,11 +78,19 @@ class FilterTests(unittest.TestCase):
 
 try:
     from subprocess import Popen, PIPE
 
     sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
     sed.communicate()
     if sed.returncode != 0:
         logger.warn("sed is not available => can't test stripAd_yahoo")
         del FilterTests.test_stripAd_yahoo
 
     try:
         import libxml2
     except:
         logger.warn("libxml2 is not available => can't test xpath_sifter")
         del FilterTests.test_xpath_filter
 
 except ImportError:
     logger.warn("Popen is not available => can't test filters")
     for method in dir(FilterTests):
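The test module now probes its optional dependencies at import time: sed for the stripAd_yahoo test, libxml2 for the xpath test, and Popen itself for any filter test at all. Since the unittest of that era has no skip mechanism, unavailable tests are deleted from the TestCase class so the runner never collects them. A minimal sketch of the idiom (the class and flag are illustrative):

import unittest

class SomeTests(unittest.TestCase):
    def test_always(self):
        pass
    def test_needs_libxml2(self):
        pass

have_libxml2 = False   # imagine the import probe failed
if not have_libxml2:
    del SomeTests.test_needs_libxml2

print [m for m in dir(SomeTests) if m.startswith('test')]   # ['test_always']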