Hash content to determine if it was modified

commit c20acf9944
parent 70f971750b
@@ -100,7 +100,16 @@ use for logging output. Note: this configuration value is processed
 <dd>Number of items to take from new feeds</dd>
 <dt><ins>spider_threads</ins></dt>
 <dd>The number of threads to use when spidering. When set to 0, the default,
 no threads are used and spidering follows the traditional algorithm.</dd>
+<dt><ins>spider_threads</ins></dt>
+<dd>The number of threads to use when spidering. When set to 0, the default,
+no threads are used and spidering follows the traditional algorithm.</dd>
+<dt><ins>http_cache_directory</ins></dt>
+<dd>If <code>spider_threads</code> is specified, you can also specify a
+directory to be used for an additional HTTP cache to front end the Venus
+cache. If specified as a relative path, it is evaluated relative to the
+<code>cache_directory</code>.</dd>
+<code>
 </dl>
 </blockquote>
 
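The options documented in this hunk are ordinary keys in the [Planet] section of the planet's ini file. A minimal sketch of how they might be set and read with the standard-library ConfigParser; the sample values, and reading from a string, are illustrative only and not part of this commit:

# Sketch only: how spider_threads and http_cache_directory might be set
# and consumed. File contents and values below are hypothetical.
from ConfigParser import ConfigParser   # Python 2 module name
from StringIO import StringIO

sample = """
[Planet]
name = Example Planet
cache_directory = cache
spider_threads = 4
http_cache_directory = http
"""

parser = ConfigParser()
parser.readfp(StringIO(sample))

threads = int(parser.get('Planet', 'spider_threads'))
http_cache = parser.get('Planet', 'http_cache_directory')
print threads, http_cache   # relative path; resolved against cache_directory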
|
@@ -285,13 +285,13 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
 
 def http_cache_directory():
     if parser.has_option('Planet', 'http_cache_directory'):
-        parser.get('Planet', 'http_cache_directory')
-    else:
-        return os.path.join(cache_directory(), 'sources/http')
+        os.path.join(cache_directory(),
+            parser.get('Planet', 'http_cache_directory'))
 
 def cache_sources_directory():
     if parser.has_option('Planet', 'cache_sources_directory'):
-        parser.get('Planet', 'cache_sources_directory')
+        return os.path.join(cache_directory(),
+            parser.get('Planet', 'cache_sources_directory'))
     else:
         return os.path.join(cache_directory(), 'sources')
 
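The reworked helpers above resolve a configured http_cache_directory against the cache directory and leave the HTTP cache unset when the option is absent. A small sketch of that resolution rule, with invented names and values:

# Sketch of the path resolution implied by the change above: a relative
# http_cache_directory lands under cache_directory, an absolute one is
# used as-is, and an unset option means no front-end HTTP cache.
import os

def resolve_http_cache(cache_dir, http_cache_dir=None):
    if http_cache_dir is None:
        return None                      # option not set: no HTTP cache
    return os.path.join(cache_dir, http_cache_dir)

print resolve_http_cache('cache', 'http')        # cache/http
print resolve_http_cache('cache', '/var/cache')  # /var/cache (absolute wins)
print resolve_http_cache('cache')                # None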
|
@@ -105,7 +105,7 @@ def links(xentry, entry):
         if entry.has_key('link'):
             entry['links'].append({'rel':'alternate', 'href':entry.link})
     xdoc = xentry.ownerDocument
-    for link in entry.links:
+    for link in entry['links']:
         if not 'href' in link.keys(): continue
         xlink = xdoc.createElement('link')
         xlink.setAttribute('href', link.get('href'))
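A hypothetical illustration of the difference the one-line change above guards against: subscript access works whether the entry is a plain dict or a feedparser FeedParserDict, while attribute access is only available on the latter.

# Illustration only; the entry value below is made up.
entry = {'links': [{'rel': 'alternate', 'href': 'http://example.com/'}]}

for link in entry['links']:            # fine for either kind of mapping
    print link.get('href')

try:
    entry.links                        # AttributeError on a plain dict
except AttributeError:
    print "attribute access needs a FeedParserDict-style object"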
|
@@ -121,36 +121,6 @@ def _is_http_uri(uri):
     parsed = urlparse.urlparse(uri)
     return parsed[0] in ['http', 'https']
 
-def spiderFeed(feed_uri, only_if_new=0):
-    """ Spider (fetch) a single feed """
-    log = planet.logger
-
-    # read cached feed info
-    sources = config.cache_sources_directory()
-    if not os.path.exists(sources):
-        os.makedirs(sources, 0700)
-
-    feed_source = filename(sources, feed_uri)
-    feed_info = feedparser.parse(feed_source)
-    if feed_info.feed and only_if_new:
-        log.info("Feed %s already in cache", feed_uri)
-        return
-    if feed_info.feed.get('planet_http_status',None) == '410':
-        log.info("Feed %s gone", feed_uri)
-        return
-
-    # read feed itself
-    modified = None
-    try:
-        modified=time.strptime(
-            feed_info.feed.get('planet_http_last_modified', None))
-    except:
-        pass
-    data = feedparser.parse(feed_info.feed.get('planet_http_location',feed_uri),
-        etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
-
-    writeCache(feed_uri, feed_info, data)
-
 def writeCache(feed_uri, feed_info, data):
     log = planet.logger
     sources = config.cache_sources_directory()
@@ -159,7 +129,8 @@ def writeCache(feed_uri, feed_info, data):
     if not data.has_key("status"):
         if data.has_key("entries") and len(data.entries)>0:
             data.status = 200
-        elif data.bozo and data.bozo_exception.__class__.__name__.lower()=='timeout':
+        elif data.bozo and \
+            data.bozo_exception.__class__.__name__.lower()=='timeout':
             data.status = 408
         else:
             data.status = 500
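Restated as a standalone function with stand-in arguments (not the real feedparser result), the status fallback in this hunk is:

# Sketch only: 200 when entries were parsed, 408 when the failure was a
# timeout, 500 otherwise. Inputs are invented.
def guess_status(entries, bozo, bozo_exception=None):
    if entries:
        return 200
    if bozo and type(bozo_exception).__name__.lower() == 'timeout':
        return 408
    return 500

print guess_status([{'title': 'hello'}], 0)       # 200
print guess_status([], 1, ValueError('boom'))     # 500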
@@ -210,11 +181,16 @@ def writeCache(feed_uri, feed_info, data):
     if data.has_key('headers'):
         if data.has_key('etag') and data.etag:
             data.feed['planet_http_etag'] = data.etag
-            log.debug("E-Tag: %s", data.etag)
-        if data.has_key('modified') and data.modified:
+        elif data.headers.has_key('etag') and data.headers['etag']:
+            data.feed['planet_http_etag'] = data.headers['etag']
+
+        if data.headers.has_key('last-modified'):
+            data.feed['planet_http_last_modified']=data.headers['last-modified']
+        elif data.has_key('modified') and data.modified:
             data.feed['planet_http_last_modified'] = time.asctime(data.modified)
-            log.debug("Last Modified: %s",
-                data.feed['planet_http_last_modified'])
+
+        if data.headers.has_key('-content-hash'):
+            data.feed['planet_content_hash'] = data.headers['-content-hash']
 
     # capture feed and data from the planet configuration file
     if data.version:
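A self-contained sketch of the capture logic above: prefer the values feedparser has already extracted, fall back to the raw response headers, and record the synthetic -content-hash header that httpThread attaches. All inputs below are invented.

# Sketch only; 'feed' and 'headers' are plain dicts standing in for the
# feedparser result and the HTTP response headers.
import time

def capture_caching_info(feed, headers, etag=None, modified=None):
    if etag:
        feed['planet_http_etag'] = etag
    elif headers.get('etag'):
        feed['planet_http_etag'] = headers['etag']

    if headers.get('last-modified'):
        feed['planet_http_last_modified'] = headers['last-modified']
    elif modified:
        feed['planet_http_last_modified'] = time.asctime(modified)

    if headers.get('-content-hash'):
        feed['planet_content_hash'] = headers['-content-hash']
    return feed

print capture_caching_info({}, {'etag': '"abc123"',
    '-content-hash': '900150983cd24fb0d6963f7d28e17f72'})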
@@ -337,13 +313,11 @@ def writeCache(feed_uri, feed_info, data):
     xdoc.unlink()
 
 def httpThread(thread_index, input_queue, output_queue, log):
-    from Queue import Empty
-    import httplib2
+    import httplib2, md5
     from socket import gaierror, error
     from httplib import BadStatusLine
 
-    http_cache = config.http_cache_directory()
-    h = httplib2.Http(http_cache)
+    h = httplib2.Http(config.http_cache_directory())
     uri, feed_info = input_queue.get(block=True)
     while uri:
         log.info("Fetching %s via %d", uri, thread_index)
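A minimal sketch of the httplib2 call the worker thread relies on: passing a directory name to httplib2.Http() enables its on-disk HTTP cache, and a response served from that cache reports resp.fromcache as True. The cache directory and URL below are examples only.

import httplib2

h = httplib2.Http('.http-cache-example')          # hypothetical cache dir
resp, content = h.request('http://example.com/', 'GET')
print resp.status, resp.fromcache, len(content)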
@@ -363,10 +337,26 @@ def httpThread(thread_index, input_queue, output_queue, log):
                 log.info("unable to map %s to a URI", uri)
                 idna = uri
 
+            # cache control headers
+            headers = {}
+            if feed_info.feed.has_key('planet_http_etag'):
+                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
+            if feed_info.feed.has_key('planet_http_last_modified'):
+                headers['If-Modified-Since'] = \
+                    feed_info.feed['planet_http_last_modified']
+
             # issue request
-            (resp, content) = h.request(idna)
-            if resp.status == 200 and resp.fromcache:
-                resp.status = 304
+            (resp, content) = h.request(idna, 'GET', headers=headers)
+
+            # unchanged detection
+            resp['-content-hash'] = md5.new(content or '').hexdigest()
+            if resp.status == 200:
+                if resp.fromcache:
+                    resp.status = 304
+                elif feed_info.feed.has_key('planet_content_hash') and \
+                    feed_info.feed['planet_content_hash'] == \
+                    resp['-content-hash']:
+                    resp.status = 304
 
             # build a file-like object
             feed = StringIO(content)
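The commit's central idea, restated as a small function: hash the fetched body and compare it with the hash remembered from the previous fetch; if nothing changed, downgrade a 200 to a 304 so the feed is treated as not modified downstream. md5.new mirrors the Python 2 module imported above; the feed body is invented.

import md5

def effective_status(status, fromcache, content, previous_hash=None):
    # Hash whatever came back (empty string when there is no body).
    content_hash = md5.new(content or '').hexdigest()
    if status == 200 and (fromcache or content_hash == previous_hash):
        status = 304                      # unchanged: pretend Not Modified
    return status, content_hash

body = '<feed xmlns="http://www.w3.org/2005/Atom"/>'
status, digest = effective_status(200, False, body)             # fresh: 200
print status
print effective_status(200, False, body, previous_hash=digest)  # unchanged: 304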
@@ -385,8 +375,7 @@ def httpThread(thread_index, input_queue, output_queue, log):
                 feed.headers['status'] = '408'
                 log.warn("Timeout in thread-%d", thread_index)
             else:
-                log.error("HTTP Error: %s in thread-%d",
-                    str(e), thread_index)
+                log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
         except Exception, e:
             import sys, traceback
             type, value, tb = sys.exc_info()
@@ -428,7 +417,7 @@ def spiderPlanet(only_if_new = False):
     threads = {}
     if int(config.spider_threads()):
         http_cache = config.http_cache_directory()
-        if not os.path.exists(http_cache):
+        if http_cache and not os.path.exists(http_cache):
             os.makedirs(http_cache, 0700)
 
         # Start all the worker threads
@@ -484,9 +473,9 @@ def spiderPlanet(only_if_new = False):
 
                 data = feedparser.parse(feed, **options)
             else:
-                data = feedparser.FeedParserDict({'version':None,
-                    'headers':feed.headers, 'entries': [],
-                    'status': int(feed.headers.status)})
+                data = feedparser.FeedParserDict({'version': None,
+                    'headers': feed.headers, 'entries': [], 'feed': {},
+                    'bozo': 0, 'status': int(feed.headers.status)})
 
             writeCache(uri, feed_info, data)
 
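A sketch of the fallback above: when there is nothing to parse, for example a 304 produced by the cache or the content-hash check, a minimal FeedParserDict gives writeCache a status, an empty entry list, and an empty feed dict to work with. The header values are invented.

import feedparser

def empty_result(status, headers):
    # Minimal stand-in for a parse result when no feed body was processed.
    return feedparser.FeedParserDict({'version': None,
        'headers': headers, 'entries': [], 'feed': {},
        'bozo': 0, 'status': int(status)})

result = empty_result('304', {'status': '304'})
print result.status, len(result.entries), result.feed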
|