Complete HttpThread refactoring
parent e85ae48722
commit 70f971750b

 246  planet/spider.py
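The first hunk below replaces httpThread's non-blocking get (which terminated the worker by letting a Queue.Empty exception escape) with a blocking get that loops until a (None, None) sentinel arrives. A minimal sketch of that worker pattern, in the same Python 2 style as spider.py; worker and fetch are illustrative names, not part of the codebase:

    from Queue import Queue

    def worker(thread_index, input_queue, output_queue):
        uri, feed_info = input_queue.get(block=True)    # block until work (or a sentinel) arrives
        while uri:
            output_queue.put((uri, feed_info, fetch(uri)))
            uri, feed_info = input_queue.get(block=True)
        # a (None, None) item means the producer is done, so the thread simply exits

    def fetch(uri):
        return 'stub response for %s' % uri             # stand-in for the real httplib2 fetch

    if __name__ == '__main__':
        inq, outq = Queue(), Queue()
        inq.put(('http://example.com/feed', {}))
        inq.put((None, None))                           # one sentinel per worker thread
        worker(0, inq, outq)
        print outq.get()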
@@ -344,68 +344,64 @@ def httpThread(thread_index, input_queue, output_queue, log):
     http_cache = config.http_cache_directory()
     h = httplib2.Http(http_cache)
-    try:
-        while True:
-            # The non-blocking get will throw an exception when the queue
-            # is empty which will terminate the thread.
-            uri, feed_info = input_queue.get(block=False)
-            log.info("Fetching %s via %d", uri, thread_index)
-            feed = StringIO('')
-            setattr(feed, 'url', uri)
-            setattr(feed, 'headers',
-                feedparser.FeedParserDict({'status':'500'}))
-            try:
-                # map IRI => URI
-                try:
-                    if isinstance(uri,unicode):
-                        idna = uri.encode('idna')
-                    else:
-                        idna = uri.decode('utf-8').encode('idna')
-                    if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
-                except:
-                    log.info("unable to map %s to a URI", uri)
-                    idna = uri
-
-                # issue request
-                (resp, content) = h.request(idna)
-                if resp.status == 200 and resp.fromcache:
-                    resp.status = 304
-
-                # build a file-like object
-                feed = StringIO(content)
-                setattr(feed, 'url', resp.get('content-location', uri))
-                if resp.has_key('content-encoding'):
-                    del resp['content-encoding']
-                setattr(feed, 'headers', resp)
-            except gaierror:
-                log.error("Fail to resolve server name %s via %d",
-                    uri, thread_index)
-            except BadStatusLine:
-                log.error("Bad Status Line received for %s via %d",
-                    uri, thread_index)
-            except error, e:
-                if e.__class__.__name__.lower()=='timeout':
-                    feed.headers['status'] = '408'
-                    log.warn("Timeout in thread-%d", thread_index)
-                else:
-                    log.error("HTTP Error: %s in thread-%d",
-                        str(e), thread_index)
-            except Exception, e:
-                import sys, traceback
-                type, value, tb = sys.exc_info()
-                log.error('Error processing %s', uri)
-                for line in (traceback.format_exception_only(type, value) +
-                    traceback.format_tb(tb)):
-                    log.error(line.rstrip())
-                continue
-
-            output_queue.put(block=True, item=(uri, feed_info, feed))
-
-    except Empty, e:
-        log.info("Thread %d finished", thread_index)
+    uri, feed_info = input_queue.get(block=True)
+    while uri:
+        log.info("Fetching %s via %d", uri, thread_index)
+        feed = StringIO('')
+        setattr(feed, 'url', uri)
+        setattr(feed, 'headers',
+            feedparser.FeedParserDict({'status':'500'}))
+        try:
+            # map IRI => URI
+            try:
+                if isinstance(uri,unicode):
+                    idna = uri.encode('idna')
+                else:
+                    idna = uri.decode('utf-8').encode('idna')
+                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
+            except:
+                log.info("unable to map %s to a URI", uri)
+                idna = uri
+
+            # issue request
+            (resp, content) = h.request(idna)
+            if resp.status == 200 and resp.fromcache:
+                resp.status = 304
+
+            # build a file-like object
+            feed = StringIO(content)
+            setattr(feed, 'url', resp.get('content-location', uri))
+            if resp.has_key('content-encoding'):
+                del resp['content-encoding']
+            setattr(feed, 'headers', resp)
+        except gaierror:
+            log.error("Fail to resolve server name %s via %d",
+                uri, thread_index)
+        except BadStatusLine:
+            log.error("Bad Status Line received for %s via %d",
+                uri, thread_index)
+        except error, e:
+            if e.__class__.__name__.lower()=='timeout':
+                feed.headers['status'] = '408'
+                log.warn("Timeout in thread-%d", thread_index)
+            else:
+                log.error("HTTP Error: %s in thread-%d",
+                    str(e), thread_index)
+        except Exception, e:
+            import sys, traceback
+            type, value, tb = sys.exc_info()
+            log.error('Error processing %s', uri)
+            for line in (traceback.format_exception_only(type, value) +
+                traceback.format_tb(tb)):
+                log.error(line.rstrip())
+            continue
+
+        output_queue.put(block=True, item=(uri, feed_info, feed))
+        uri, feed_info = input_queue.get(block=True)
 
 def spiderPlanet(only_if_new = False):
     """ Spider (fetch) an entire planet """
     # log = planet.getLogger(config.log_level(),config.log_format())
     log = planet.getLogger(config.log_level(),config.log_format())
 
     global index
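Both versions of the loop wrap the httplib2 response in a StringIO so the parse stage can treat it like a freshly fetched feed, carrying the effective URL and response headers along with the body. A condensed sketch of that wrapping, with an illustrative URL and cache directory in place of the configured ones:

    import httplib2
    from StringIO import StringIO

    h = httplib2.Http('.cache')                               # illustrative cache directory
    resp, content = h.request('http://example.com/feed')      # illustrative URL
    if resp.status == 200 and resp.fromcache:
        resp.status = 304                                     # report cache hits as "not modified"

    feed = StringIO(content)                                  # file-like body for feedparser.parse()
    setattr(feed, 'url', resp.get('content-location', 'http://example.com/feed'))
    setattr(feed, 'headers', resp)                            # httplib2's response doubles as a header dict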
@@ -414,6 +410,7 @@ def spiderPlanet(only_if_new = False):
     timeout = config.feed_timeout()
     try:
         socket.setdefaulttimeout(float(timeout))
+        log.info("Socket timeout set to %d seconds", timeout)
     except:
         try:
             from planet import timeoutsocket
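The hunk above only adds a log line; the timeout itself is process-wide. A small sketch of what socket.setdefaulttimeout does here, with an illustrative value standing in for config.feed_timeout():

    import socket

    timeout = 20.0                                # illustrative; spider.py passes config.feed_timeout()
    socket.setdefaulttimeout(float(timeout))      # every socket created afterwards inherits this limit
    print socket.getdefaulttimeout()              # -> 20.0
    # A fetch that exceeds the limit raises socket.timeout, which httpThread's
    # "except error" branch records as status '408' for that feed.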
@@ -422,84 +419,87 @@ def spiderPlanet(only_if_new = False):
         except:
             log.warning("Timeout set to invalid value '%s', skipping", timeout)
 
-    from Queue import Queue
-    from threading import Thread
-
-    fetch_queue = Queue()
-    parse_queue = Queue()
-
     threads = {}
     if int(config.spider_threads()):
+        from Queue import Queue
+        from threading import Thread
+
+        fetch_queue = Queue()
+        parse_queue = Queue()
+
         http_cache = config.http_cache_directory()
         if not os.path.exists(http_cache):
            os.makedirs(http_cache, 0700)
 
-        # Load the fetch_queue with all the HTTP(S) uris.
-        log.info("Building work queue")
-        for uri in config.subscriptions():
-            if _is_http_uri(uri):
-                # read cached feed info
-                sources = config.cache_sources_directory()
-                feed_source = filename(sources, uri)
-                feed_info = feedparser.parse(feed_source)
-
-                if feed_info.feed and only_if_new:
-                    log.info("Feed %s already in cache", uri)
-                    continue
-                if feed_info.feed.get('planet_http_status',None) == '410':
-                    log.info("Feed %s gone", uri)
-                    continue
-
-                fetch_queue.put(item=((uri, feed_info)))
-
         # Start all the worker threads
-        threads = dict([(i, Thread(target=httpThread,
-            args=(i,fetch_queue, parse_queue, log)))
-            for i in range(int(config.spider_threads()))])
-        for t in threads.itervalues():
-            t.start()
+        for i in range(int(config.spider_threads())):
+            threads[i] = Thread(target=httpThread,
+                args=(i,fetch_queue, parse_queue, log))
+            threads[i].start()
+    else:
+        log.info("Building work queue")
 
-    # Process the results as they arrive
-    while fetch_queue.qsize() or parse_queue.qsize() or threads:
-        while parse_queue.qsize() == 0 and threads:
-            time.sleep(0.1)
-        while parse_queue.qsize():
-            (uri, feed_info, feed) = parse_queue.get(False)
-            try:
-                if int(feed.headers.status) < 300:
-                    data = feedparser.parse(feed)
-                else:
-                    data = feedparser.FeedParserDict({'version':None,
-                        'headers':feed.headers, 'entries': [],
-                        'status': int(feed.headers.status)})
-                writeCache(uri, feed_info, data)
-            except Exception, e:
-                import sys, traceback
-                type, value, tb = sys.exc_info()
-                log.error('Error processing %s', uri)
-                for line in (traceback.format_exception_only(type, value) +
-                    traceback.format_tb(tb)):
-                    log.error(line.rstrip())
-        for index in threads.keys():
-            if not threads[index].isAlive():
-                del threads[index]
-    log.info("Finished threaded part of processing.")
-
-    # Process non-HTTP uris if we are threading, otherwise process *all* uris here.
-    unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
-    for feed in unthreaded_work_queue:
-        try:
-            spiderFeed(feed, only_if_new=only_if_new)
-        except Exception,e:
-            import sys, traceback
-            type, value, tb = sys.exc_info()
-            log.error('Error processing %s', feed)
-            for line in (traceback.format_exception_only(type, value) +
-                traceback.format_tb(tb)):
-                log.error(line.rstrip())
+    # Load the fetch and parse work queues
+    for uri in config.subscriptions():
+        # read cached feed info
+        sources = config.cache_sources_directory()
+        feed_source = filename(sources, uri)
+        feed_info = feedparser.parse(feed_source)
+
+        if feed_info.feed and only_if_new:
+            log.info("Feed %s already in cache", uri)
+            continue
+        if feed_info.feed.get('planet_http_status',None) == '410':
+            log.info("Feed %s gone", uri)
+            continue
+
+        if threads and _is_http_uri(uri):
+            fetch_queue.put(item=(uri, feed_info))
+        else:
+            parse_queue.put(item=(uri, feed_info, uri))
+
+    # Mark the end of the fetch queue
+    for thread in threads.keys():
+        fetch_queue.put(item=(None, None))
+
+    # Process the results as they arrive
+    while fetch_queue.qsize() or parse_queue.qsize() or threads:
+        while parse_queue.qsize() == 0 and threads:
+            time.sleep(0.1)
+        while parse_queue.qsize():
+            (uri, feed_info, feed) = parse_queue.get(False)
+            try:
+
+                if not hasattr(feed,'headers') or int(feed.headers.status)<300:
+                    options = {}
+                    if hasattr(feed_info,'feed'):
+                        options['etag'] = \
+                            feed_info.feed.get('planet_http_etag',None)
+                        try:
+                            modified=time.strptime(
+                                feed_info.feed.get('planet_http_last_modified',
+                                None))
+                        except:
+                            pass
+
+                    data = feedparser.parse(feed, **options)
+                else:
+                    data = feedparser.FeedParserDict({'version':None,
+                        'headers':feed.headers, 'entries': [],
+                        'status': int(feed.headers.status)})
+
+                writeCache(uri, feed_info, data)
+
+            except Exception, e:
+                import sys, traceback
+                type, value, tb = sys.exc_info()
+                log.error('Error processing %s', uri)
+                for line in (traceback.format_exception_only(type, value) +
+                    traceback.format_tb(tb)):
+                    log.error(line.rstrip())
+
+        for index in threads.keys():
+            if not threads[index].isAlive():
+                del threads[index]
+        if not threads:
+            log.info("Finished threaded part of processing.")
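Taken together with the worker sketch earlier, the refactored spiderPlanet follows a common coordinator pattern: start the workers first, load the queue, push one sentinel per worker, then drain results while reaping threads that have exited. A compact sketch of that flow; coordinator and the two-element work items are illustrative, not the spider.py API:

    from Queue import Queue
    from threading import Thread
    import time

    def coordinator(uris, worker, thread_count=2):
        fetch_queue, parse_queue = Queue(), Queue()
        threads = {}
        for i in range(thread_count):
            threads[i] = Thread(target=worker, args=(i, fetch_queue, parse_queue))
            threads[i].start()

        for uri in uris:                            # load the work queue
            fetch_queue.put((uri, {}))
        for i in threads.keys():                    # mark the end of the queue
            fetch_queue.put((None, None))

        results = []
        while fetch_queue.qsize() or parse_queue.qsize() or threads:
            while parse_queue.qsize() == 0 and threads:
                time.sleep(0.1)
            while parse_queue.qsize():
                results.append(parse_queue.get(False))
            for i in threads.keys():                # reap workers that have exited
                if not threads[i].isAlive():
                    del threads[i]
        return results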
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 import unittest, os, glob, calendar, shutil, time
-from planet.spider import filename, spiderFeed, spiderPlanet
+from planet.spider import filename, spiderPlanet, writeCache
 from planet import feedparser, config
 import planet
 
@@ -43,6 +43,11 @@ class SpiderTest(unittest.TestCase):
         self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
             filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
 
+    def spiderFeed(self, feed_uri):
+        feed_info = feedparser.parse('<feed/>')
+        data = feedparser.parse(feed_uri)
+        writeCache(feed_uri, feed_info, data)
+
     def verify_spiderFeed(self):
         files = glob.glob(workdir+"/*")
         files.sort()
@@ -65,13 +70,13 @@ class SpiderTest(unittest.TestCase):
 
     def test_spiderFeed(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()
 
     def test_spiderUpdate(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1a')
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1a')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()
 
     def verify_spiderPlanet(self):