Numerous stability fixes: recover from HTTP and FeedParser errors
commit cacb8ffeb8
parent 9dbc6b5dbe
@@ -342,7 +342,11 @@ _cp1252 = {
 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
     uri = _urifixer.sub(r'\1\3', uri)
-    return urlparse.urljoin(base, uri)
+    try:
+        return urlparse.urljoin(base, uri)
+    except:
+        uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)])
+        return urlparse.urljoin(base, uri)

 class _FeedParserMixin:
     namespaces = {'': '',
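The hunk above makes feedparser's _urljoin recover when a raw URI trips up urljoin: the join is retried after percent-quoting each URI component. A minimal standalone sketch of the same fallback, assuming Python 2's urlparse/urllib as used above; safe_urljoin and the sample URL are hypothetical, purely for illustration.

import urllib, urlparse

def safe_urljoin(base, uri):
    # hypothetical standalone version of the patched _urljoin fallback
    try:
        return urlparse.urljoin(base, uri)
    except:
        # quote each component of the URI and retry the join
        uri = urlparse.urlunparse(
            [urllib.quote(part) for part in urlparse.urlparse(uri)])
        return urlparse.urljoin(base, uri)

print safe_urljoin('http://example.com/blog/', 'entry one.html')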
@@ -188,14 +188,14 @@ def source(xsource, source, bozo, format):
     date(xsource, 'updated', source.get('updated_parsed',time.gmtime()))

+    if format: source['planet_format'] = format
+    if not bozo == None: source['planet_bozo'] = bozo and 'true' or 'false'
+
     # propagate planet inserted information
     for key, value in source.items():
         if key.startswith('planet_'):
             createTextElement(xsource, key.replace('_',':',1), value)

-    createTextElement(xsource, 'planet:bozo', bozo and 'true' or 'false')
-    createTextElement(xsource, 'planet:format', format)
-
 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
@@ -99,6 +99,7 @@ def scrub(feed, data):

 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
+    log = planet.logger

     # read cached feed info
     sources = config.cache_sources_directory()
@@ -125,8 +126,10 @@ def spiderFeed(feed):
     else:
         data.status = 500

+    activity_horizon = \
+        time.gmtime(time.time()-86400*config.activity_threshold(feed))
+
     # process based on the HTTP status code
-    log = planet.logger
     if data.status == 200 and data.has_key("url"):
         data.feed['planet_http_location'] = data.url
     elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
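The added activity_horizon is simply "now minus N days" expressed as a UTC struct_time, so it can be compared directly with feedparser's *_parsed dates. A small illustration of the arithmetic, using a made-up threshold of 90 days in place of config.activity_threshold(feed):

import time

threshold_days = 90  # stand-in for config.activity_threshold(feed)

# 86400 seconds per day: the horizon is a struct_time N days in the past
activity_horizon = time.gmtime(time.time() - 86400 * threshold_days)

# struct_time values compare as tuples, so any feed date older than
# the horizon sorts below it
print time.strftime('%Y-%m-%dT%H:%M:%SZ', activity_horizon)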
@@ -134,8 +137,17 @@ def spiderFeed(feed):
         data.feed['planet_http_location'] = data.url
     elif data.status == 304:
         log.info("Feed %s unchanged", feed)
-        if not feed_info.feed.has_key('planet_message'): return
-        del feed_info.feed['planet_message']
+
+        if not feed_info.feed.has_key('planet_message'):
+            if feed_info.feed.has_key('planet_updated'):
+                updated = feed_info.feed.planet_updated
+                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
+                    return
+        else:
+            if feed_info.feed.planet_message.startswith("no activity in"):
+                return
+            del feed_info.feed['planet_message']
+
     elif data.status == 410:
         log.info("Feed %s gone", feed)
     elif data.status == 408:
@@ -146,7 +158,9 @@ def spiderFeed(feed):
         log.info("Updating feed %s", feed)

     # if read failed, retain cached information
-    if not data.version and feed_info.version: data.feed = feed_info.feed
+    if not data.version and feed_info.version:
+        data.feed = feed_info.feed
+        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
     data.feed['planet_http_status'] = str(data.status)

     # capture etag and last-modified information
@@ -212,8 +226,6 @@ def spiderFeed(feed):

     # identify inactive feeds
     if config.activity_threshold(feed):
-        activity_horizon = \
-            time.gmtime(time.time()-86400*config.activity_threshold(feed))
         updated = [entry.updated_parsed for entry in data.entries
             if entry.has_key('updated_parsed')]
         updated.sort()
@@ -230,18 +242,20 @@ def spiderFeed(feed):
             data.feed['planet_message'] = msg

     # report channel level errors
-    if data.status == 403:
-        data.feed['planet_message'] = "403: forbidden"
+    if data.status == 226:
+        if data.feed.has_key('planet_message'): del data.feed['planet_message']
+    elif data.status == 403:
+        data.feed['planet_message'] = "403: forbidden"
     elif data.status == 404:
         data.feed['planet_message'] = "404: not found"
     elif data.status == 408:
         data.feed['planet_message'] = "408: request timeout"
     elif data.status == 410:
         data.feed['planet_message'] = "410: gone"
     elif data.status == 500:
         data.feed['planet_message'] = "internal server error"
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % data.status

     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
@@ -257,4 +271,12 @@ def spiderPlanet():
     planet.setTimeout(config.feed_timeout())

     for feed in config.subscriptions():
-        spiderFeed(feed)
+        try:
+            spiderFeed(feed)
+        except Exception,e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', feed)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
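The spiderPlanet change above keeps one broken feed from aborting the whole run: each fetch is wrapped in try/except and the traceback is logged line by line instead of propagating. The same pattern in isolation, as a sketch with a hypothetical process() and the standard logging module standing in for spiderFeed and planet.logger:

import logging, sys, traceback

logging.basicConfig(level=logging.INFO)
log = logging.getLogger('spider')

def process(item):
    # hypothetical per-item work; one item fails to show the recovery path
    if item == 'bad':
        raise ValueError('boom')
    log.info('processed %s', item)

for item in ['good', 'bad', 'also good']:
    try:
        process(item)
    except Exception, e:
        type, value, tb = sys.exc_info()
        log.error('Error processing %s', item)
        for line in (traceback.format_exception_only(type, value) +
                traceback.format_tb(tb)):
            log.error(line.rstrip())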
@@ -78,11 +78,19 @@ class FilterTests(unittest.TestCase):

 try:
     from subprocess import Popen, PIPE

     sed=Popen(['sed','--version'],stdout=PIPE,stderr=PIPE)
     sed.communicate()
     if sed.returncode != 0:
         logger.warn("sed is not available => can't test stripAd_yahoo")
         del FilterTests.test_stripAd_yahoo

+    try:
+        import libxml2
+    except:
+        logger.warn("libxml2 is not available => can't test xpath_sifter")
+        del FilterTests.test_xpath_filter
+
 except ImportError:
     logger.warn("Popen is not available => can't test filters")
     for method in dir(FilterTests):
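The test change follows the same defensive idea: probe for an optional dependency and delete the tests that need it, so the rest of the suite still runs. A minimal sketch of the pattern with a hypothetical test body; only the probe-and-delete part mirrors the hunk above:

import logging, unittest

logger = logging.getLogger('tests')

class FilterTests(unittest.TestCase):
    def test_xpath_filter(self):
        # hypothetical body; the real test exercises the xpath sifter
        import libxml2
        self.assertTrue(hasattr(libxml2, 'parseDoc'))

try:
    import libxml2
except ImportError:
    logger.warn("libxml2 is not available => can't test xpath_sifter")
    del FilterTests.test_xpath_filter

if __name__ == '__main__':
    unittest.main()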