ETag and Last-Modified support
This commit is contained in:
parent
7326419aeb
commit
14b3ccf07e
@ -45,8 +45,61 @@ def write(xdoc, out):
|
|||||||
|
|
||||||
def spiderFeed(feed):
|
def spiderFeed(feed):
|
||||||
""" Spider (fetch) a single feed """
|
""" Spider (fetch) a single feed """
|
||||||
data = feedparser.parse(feed)
|
|
||||||
if not data.feed: return
|
# read cached feed info
|
||||||
|
sources = config.cache_sources_directory()
|
||||||
|
feed_source = filename(sources, feed)
|
||||||
|
feed_info = feedparser.parse(feed_source)
|
||||||
|
if feed_info.feed.get('planet_http_status',None) == '410': return
|
||||||
|
|
||||||
|
# read feed itself
|
||||||
|
modified = None
|
||||||
|
try:
|
||||||
|
modified=time.strptime(
|
||||||
|
feed_info.feed.get('planet_http_last_modified', None))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
data = feedparser.parse(feed_info.feed.get('planet_http_location',feed),
|
||||||
|
etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
|
||||||
|
|
||||||
|
# capture http status
|
||||||
|
if not data.has_key("status"):
|
||||||
|
if data.has_key("entries") and len(data.entries)>0:
|
||||||
|
data.status = 200
|
||||||
|
elif data.bozo and data.bozo_exception.__class__.__name__=='Timeout':
|
||||||
|
data.status = 408
|
||||||
|
else:
|
||||||
|
data.status = 500
|
||||||
|
data.feed['planet_http_status'] = str(data.status)
|
||||||
|
|
||||||
|
# process based on the HTTP status code
|
||||||
|
log = planet.logger
|
||||||
|
if data.status == 301 and data.has_key("entries") and len(data.entries)>0:
|
||||||
|
log.warning("Feed has moved from <%s> to <%s>", feed, data.url)
|
||||||
|
data.feed['planet_http_location'] = data.url
|
||||||
|
elif data.status == 304:
|
||||||
|
return log.info("Feed %s unchanged", feed)
|
||||||
|
elif data.status >= 400:
|
||||||
|
feed_info.update(data.feed)
|
||||||
|
data.feed = feed_info
|
||||||
|
if data.status == 410:
|
||||||
|
log.info("Feed %s gone", feed)
|
||||||
|
elif data.status == 408:
|
||||||
|
log.warning("Feed %s timed out", feed)
|
||||||
|
else:
|
||||||
|
log.error("Error %d while updating feed %s", data.status, feed)
|
||||||
|
else:
|
||||||
|
log.info("Updating feed %s", feed)
|
||||||
|
|
||||||
|
# capture etag and last-modified information
|
||||||
|
if data.has_key('headers'):
|
||||||
|
if data.has_key('etag') and data.etag:
|
||||||
|
data.feed['planet_http_etag'] = data.etag
|
||||||
|
log.debug("E-Tag: %s", data.etag)
|
||||||
|
if data.has_key('modified') and data.modified:
|
||||||
|
data.feed['planet_http_last_modified'] = time.asctime(data.modified)
|
||||||
|
log.debug("Last Modified: %s",
|
||||||
|
data.feed['planet_http_last_modified'])
|
||||||
|
|
||||||
# capture feed and data from the planet configuration file
|
# capture feed and data from the planet configuration file
|
||||||
if not data.feed.has_key('links'): data.feed['links'] = list()
|
if not data.feed.has_key('links'): data.feed['links'] = list()
|
||||||
@ -59,7 +112,6 @@ def spiderFeed(feed):
|
|||||||
data.feed['planet_'+name] = value
|
data.feed['planet_'+name] = value
|
||||||
|
|
||||||
# write the feed info to the cache
|
# write the feed info to the cache
|
||||||
sources = config.cache_sources_directory()
|
|
||||||
if not os.path.exists(sources): os.makedirs(sources)
|
if not os.path.exists(sources): os.makedirs(sources)
|
||||||
xdoc=minidom.parseString('''<feed xmlns:planet="%s"
|
xdoc=minidom.parseString('''<feed xmlns:planet="%s"
|
||||||
xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
|
xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
|
||||||
@ -96,5 +148,4 @@ def spiderPlanet(configFile):
|
|||||||
planet.setTimeout(config.feed_timeout())
|
planet.setTimeout(config.feed_timeout())
|
||||||
|
|
||||||
for feed in config.feeds():
|
for feed in config.feeds():
|
||||||
log.info("Updating feed %s", feed)
|
|
||||||
spiderFeed(feed)
|
spiderFeed(feed)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user