Complete HttpThread refactoring

Sam Ruby 2006-11-21 09:11:52 -05:00
parent e85ae48722
commit 70f971750b
2 changed files with 132 additions and 127 deletions

View File

@@ -344,11 +344,8 @@ def httpThread(thread_index, input_queue, output_queue, log):
     http_cache = config.http_cache_directory()
     h = httplib2.Http(http_cache)
-    try:
-        while True:
-            # The non-blocking get will throw an exception when the queue
-            # is empty which will terminate the thread.
-            uri, feed_info = input_queue.get(block=False)
+    uri, feed_info = input_queue.get(block=True)
+    while uri:
         log.info("Fetching %s via %d", uri, thread_index)
         feed = StringIO('')
         setattr(feed, 'url', uri)
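The refactoring replaces the old "non-blocking get raises Empty" termination with a blocking get that loops until a sentinel pair arrives. A minimal sketch of that worker pattern in the same Python 2 style (the example.com URI and the 'fetched' placeholder are illustrative, not Planet's fetch logic):

    from Queue import Queue
    from threading import Thread

    def worker(input_queue, output_queue):
        # Block until work arrives; a (None, None) pair means "shut down".
        uri, feed_info = input_queue.get(block=True)
        while uri:
            output_queue.put((uri, 'fetched'))   # stand-in for the real HTTP fetch
            uri, feed_info = input_queue.get(block=True)

    input_queue, output_queue = Queue(), Queue()
    t = Thread(target=worker, args=(input_queue, output_queue))
    t.start()
    input_queue.put(('http://example.com/feed', {}))
    input_queue.put((None, None))                # sentinel lets the worker exit
    t.join()
    print output_queue.get()                     # ('http://example.com/feed', 'fetched')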
@@ -400,12 +397,11 @@ def httpThread(thread_index, input_queue, output_queue, log):
             continue
         output_queue.put(block=True, item=(uri, feed_info, feed))
-    except Empty, e:
-        log.info("Thread %d finished", thread_index)
+        uri, feed_info = input_queue.get(block=True)

 def spiderPlanet(only_if_new = False):
     """ Spider (fetch) an entire planet """
-    # log = planet.getLogger(config.log_level(),config.log_format())
     log = planet.getLogger(config.log_level(),config.log_format())

     global index
@@ -414,6 +410,7 @@ def spiderPlanet(only_if_new = False):
     timeout = config.feed_timeout()
     try:
         socket.setdefaulttimeout(float(timeout))
+        log.info("Socket timeout set to %d seconds", timeout)
     except:
         try:
             from planet import timeoutsocket
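The new log line simply records the process-wide timeout this block installs. For context, a short sketch of how socket.setdefaulttimeout() affects connections made afterwards (the host and the 20-second value are placeholders, not Planet's feed_timeout):

    import socket

    timeout = 20.0                        # illustrative value
    socket.setdefaulttimeout(timeout)     # affects every socket created after this point
    s = socket.socket()
    try:
        s.connect(('example.com', 80))
    except (socket.timeout, socket.error):
        print 'gave up or failed after at most %s seconds' % timeout
    s.close()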
@@ -422,21 +419,28 @@ def spiderPlanet(only_if_new = False):
         except:
             log.warning("Timeout set to invalid value '%s', skipping", timeout)

-    if int(config.spider_threads()):
     from Queue import Queue
     from threading import Thread

     fetch_queue = Queue()
     parse_queue = Queue()

+    threads = {}
+    if int(config.spider_threads()):
         http_cache = config.http_cache_directory()
         if not os.path.exists(http_cache):
             os.makedirs(http_cache, 0700)
-        # Load the fetch_queue with all the HTTP(S) uris.
+        # Start all the worker threads
+        for i in range(int(config.spider_threads())):
+            threads[i] = Thread(target=httpThread,
+                args=(i,fetch_queue, parse_queue, log))
+            threads[i].start()
+    else:
         log.info("Building work queue")

+    # Load the fetch and parse work queues
     for uri in config.subscriptions():
-        if _is_http_uri(uri):
         # read cached feed info
         sources = config.cache_sources_directory()
         feed_source = filename(sources, uri)
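Keeping the workers in a dict keyed by thread index is what later lets finished threads be deleted one by one. Roughly, with the same Queue/Thread APIs and a stubbed-out worker standing in for httpThread (the thread count 4 stands in for int(config.spider_threads())):

    from Queue import Queue
    from threading import Thread

    def http_worker(index, fetch_queue, parse_queue, log=None):
        pass                                  # stub; the real body is httpThread above

    fetch_queue, parse_queue = Queue(), Queue()
    threads = {}
    for i in range(4):
        threads[i] = Thread(target=http_worker,
                            args=(i, fetch_queue, parse_queue))
        threads[i].start()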
@@ -449,14 +453,14 @@ def spiderPlanet(only_if_new = False):
             log.info("Feed %s gone", uri)
             continue

-            fetch_queue.put(item=((uri, feed_info)))
+        if threads and _is_http_uri(uri):
+            fetch_queue.put(item=(uri, feed_info))
+        else:
+            parse_queue.put(item=(uri, feed_info, uri))

-        # Start all the worker threads
-        threads = dict([(i, Thread(target=httpThread,
-            args=(i,fetch_queue, parse_queue, log)))
-            for i in range(int(config.spider_threads()))])
-        for t in threads.itervalues():
-            t.start()
+    # Mark the end of the fetch queue
+    for thread in threads.keys():
+        fetch_queue.put(item=(None, None))

     # Process the results as they arrive
     while fetch_queue.qsize() or parse_queue.qsize() or threads:
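Because every worker blocks on fetch_queue.get, the producer has to enqueue one (None, None) sentinel per thread, otherwise some workers would block forever. A self-contained sketch of that shutdown step (it joins the workers for brevity, whereas spiderPlanet instead polls the queues and reaps finished threads below):

    from Queue import Queue
    from threading import Thread

    def worker(q):
        uri, info = q.get(block=True)
        while uri:                            # a (None, None) pair ends the loop
            uri, info = q.get(block=True)

    q = Queue()
    threads = dict((i, Thread(target=worker, args=(q,))) for i in range(3))
    for t in threads.values():
        t.start()

    q.put(('http://example.com/a', {}))       # placeholder work items
    q.put(('http://example.com/b', {}))
    for i in threads.keys():                  # one sentinel per worker
        q.put((None, None))
    for t in threads.values():
        t.join()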
@@ -466,8 +470,19 @@ def spiderPlanet(only_if_new = False):
             (uri, feed_info, feed) = parse_queue.get(False)
             try:
-                if int(feed.headers.status) < 300:
-                    data = feedparser.parse(feed)
+                if not hasattr(feed,'headers') or int(feed.headers.status)<300:
+                    options = {}
+                    if hasattr(feed_info,'feed'):
+                        options['etag'] = \
+                            feed_info.feed.get('planet_http_etag',None)
+                        try:
+                            modified=time.strptime(
+                                feed_info.feed.get('planet_http_last_modified',
+                                None))
+                        except:
+                            pass
+                    data = feedparser.parse(feed, **options)
                 else:
                     data = feedparser.FeedParserDict({'version':None,
                         'headers':feed.headers, 'entries': [],
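The options dict exists so cached validators can be handed to feedparser, whose parse() accepts etag and modified keyword arguments for conditional fetching. A hedged example against a plain URL (the address is a placeholder; the planet_http_* keys above are Planet's own cached copies of those headers):

    import feedparser

    url = 'http://example.com/feed.atom'      # placeholder feed URL
    first = feedparser.parse(url)

    # Re-request with the validators from the first response; an unchanged feed
    # comes back as HTTP 304 with no entries to re-process.
    second = feedparser.parse(url,
                              etag=first.get('etag'),
                              modified=first.get('modified'))
    if getattr(second, 'status', None) == 304:
        print 'feed unchanged since last fetch'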
@@ -482,24 +497,9 @@ def spiderPlanet(only_if_new = False):
                 for line in (traceback.format_exception_only(type, value) +
                     traceback.format_tb(tb)):
                     log.error(line.rstrip())

         for index in threads.keys():
             if not threads[index].isAlive():
                 del threads[index]
+                if not threads:
                     log.info("Finished threaded part of processing.")
-
-    # Process non-HTTP uris if we are threading, otherwise process *all* uris here.
-    unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
-    for feed in unthreaded_work_queue:
-        try:
-            spiderFeed(feed, only_if_new=only_if_new)
-        except Exception,e:
-            import sys, traceback
-            type, value, tb = sys.exc_info()
-            log.error('Error processing %s', feed)
-            for line in (traceback.format_exception_only(type, value) +
-                traceback.format_tb(tb)):
-                log.error(line.rstrip())
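The surviving loop drops each thread from the dict as soon as it stops, which is what eventually makes the outer "while ... or threads:" condition false. A small standalone illustration of that reaping pattern (isAlive() is the Python 2 spelling; the sleeping task is a stand-in for a real worker):

    import time
    from threading import Thread

    def short_task():
        time.sleep(0.1)                       # stand-in for real work

    threads = dict((i, Thread(target=short_task)) for i in range(3))
    for t in threads.values():
        t.start()

    while threads:
        for index in threads.keys():          # keys() returns a copy in Python 2, so del is safe
            if not threads[index].isAlive():
                del threads[index]
        time.sleep(0.05)
    print 'all workers finished'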

View File

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import unittest, os, glob, calendar, shutil, time
-from planet.spider import filename, spiderFeed, spiderPlanet
+from planet.spider import filename, spiderPlanet, writeCache
 from planet import feedparser, config
 import planet
@@ -43,6 +43,11 @@ class SpiderTest(unittest.TestCase):
         self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
             filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))

+    def spiderFeed(self, feed_uri):
+        feed_info = feedparser.parse('<feed/>')
+        data = feedparser.parse(feed_uri)
+        writeCache(feed_uri, feed_info, data)
+
     def verify_spiderFeed(self):
         files = glob.glob(workdir+"/*")
         files.sort()
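The new test helper drives writeCache directly instead of going through spiderFeed and the network layer; feedparser.parse('<feed/>') just manufactures an empty "previously cached" structure. For instance, parsing that stub document yields a FeedParserDict with no entries and no feed metadata:

    import feedparser

    feed_info = feedparser.parse('<feed/>')
    print len(feed_info.entries)              # 0
    print feed_info.feed.get('title')         # None -- no metadata in the stub document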
@@ -65,13 +70,13 @@ class SpiderTest(unittest.TestCase):
     def test_spiderFeed(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()

     def test_spiderUpdate(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1a')
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1a')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()

     def verify_spiderPlanet(self):