Complete HttpThread refactoring

Sam Ruby 2006-11-21 09:11:52 -05:00
parent e85ae48722
commit 70f971750b
2 changed files with 132 additions and 127 deletions

View File

@@ -344,11 +344,8 @@ def httpThread(thread_index, input_queue, output_queue, log):
http_cache = config.http_cache_directory()
h = httplib2.Http(http_cache)
try:
while True:
# The non-blocking get will throw an exception when the queue
# is empty which will terminate the thread.
uri, feed_info = input_queue.get(block=False)
uri, feed_info = input_queue.get(block=True)
while uri:
log.info("Fetching %s via %d", uri, thread_index)
feed = StringIO('')
setattr(feed, 'url', uri)
@@ -400,12 +397,11 @@ def httpThread(thread_index, input_queue, output_queue, log):
continue
output_queue.put(block=True, item=(uri, feed_info, feed))
except Empty, e:
log.info("Thread %d finished", thread_index)
uri, feed_info = input_queue.get(block=True)
def spiderPlanet(only_if_new = False):
""" Spider (fetch) an entire planet """
# log = planet.getLogger(config.log_level(),config.log_format())
log = planet.getLogger(config.log_level(),config.log_format())
global index
@@ -414,6 +410,7 @@ def spiderPlanet(only_if_new = False):
timeout = config.feed_timeout()
try:
socket.setdefaulttimeout(float(timeout))
log.info("Socket timeout set to %d seconds", timeout)
except:
try:
from planet import timeoutsocket
@@ -422,21 +419,28 @@ def spiderPlanet(only_if_new = False):
except:
log.warning("Timeout set to invalid value '%s', skipping", timeout)
if int(config.spider_threads()):
from Queue import Queue
from threading import Thread
fetch_queue = Queue()
parse_queue = Queue()
threads = {}
if int(config.spider_threads()):
http_cache = config.http_cache_directory()
if not os.path.exists(http_cache):
os.makedirs(http_cache, 0700)
# Load the fetch_queue with all the HTTP(S) uris.
# Start all the worker threads
for i in range(int(config.spider_threads())):
threads[i] = Thread(target=httpThread,
args=(i,fetch_queue, parse_queue, log))
threads[i].start()
else:
log.info("Building work queue")
# Load the fetch and parse work queues
for uri in config.subscriptions():
if _is_http_uri(uri):
# read cached feed info
sources = config.cache_sources_directory()
feed_source = filename(sources, uri)
@@ -449,14 +453,14 @@ def spiderPlanet(only_if_new = False):
log.info("Feed %s gone", uri)
continue
fetch_queue.put(item=((uri, feed_info)))
if threads and _is_http_uri(uri):
fetch_queue.put(item=(uri, feed_info))
else:
parse_queue.put(item=(uri, feed_info, uri))
# Start all the worker threads
threads = dict([(i, Thread(target=httpThread,
args=(i,fetch_queue, parse_queue, log)))
for i in range(int(config.spider_threads()))])
for t in threads.itervalues():
t.start()
# Mark the end of the fetch queue
for thread in threads.keys():
fetch_queue.put(item=(None, None))
# Process the results as they arrive
while fetch_queue.qsize() or parse_queue.qsize() or threads:
@@ -466,8 +470,19 @@ def spiderPlanet(only_if_new = False):
(uri, feed_info, feed) = parse_queue.get(False)
try:
if int(feed.headers.status) < 300:
data = feedparser.parse(feed)
if not hasattr(feed,'headers') or int(feed.headers.status)<300:
options = {}
if hasattr(feed_info,'feed'):
options['etag'] = \
feed_info.feed.get('planet_http_etag',None)
try:
modified=time.strptime(
feed_info.feed.get('planet_http_last_modified',
None))
except:
pass
data = feedparser.parse(feed, **options)
else:
data = feedparser.FeedParserDict({'version':None,
'headers':feed.headers, 'entries': [],
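The added branch above builds the keyword arguments for feedparser.parse from the cached feed_info, so a previously recorded ETag and Last-Modified value can be handed back to the parser, with the strptime call wrapped in a try/except because the cached date may be missing or malformed. A hedged sketch of that conditional-parse call in isolation; the placeholder values stand in for what spider.py reads out of its cache:

    import time
    import feedparser

    options = {}
    cached_etag = '"abc123"'                      # placeholder for planet_http_etag
    cached_modified = 'Tue Nov 21 09:11:52 2006'  # placeholder for planet_http_last_modified

    if cached_etag:
        options['etag'] = cached_etag
    try:
        # strptime's default format matches asctime-style strings; anything
        # else (or None) raises and the conditional date is simply skipped.
        options['modified'] = time.strptime(cached_modified)
    except (TypeError, ValueError):
        pass

    data = feedparser.parse('http://example.com/feed.xml', **options)

When feedparser is given a URL to fetch itself, etag and modified become If-None-Match and If-Modified-Since request headers on that fetch.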
@@ -482,24 +497,9 @@
for line in (traceback.format_exception_only(type, value) +
traceback.format_tb(tb)):
log.error(line.rstrip())
for index in threads.keys():
if not threads[index].isAlive():
del threads[index]
if not threads:
log.info("Finished threaded part of processing.")
# Process non-HTTP uris if we are threading, otherwise process *all* uris here.
unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
for feed in unthreaded_work_queue:
try:
spiderFeed(feed, only_if_new=only_if_new)
except Exception,e:
import sys, traceback
type, value, tb = sys.exc_info()
log.error('Error processing %s', feed)
for line in (traceback.format_exception_only(type, value) +
traceback.format_tb(tb)):
log.error(line.rstrip())
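The refactored httpThread blocks on its input queue and keeps fetching until it receives a (None, None) sentinel, with one sentinel enqueued per worker after the fetch queue has been loaded; the hunks suggest the earlier version started the threads before the queue was loaded and relied on a Queue.Empty exception from a non-blocking get to end them, which presumably risked a worker exiting before any work had been queued. A minimal sketch of the sentinel pattern the hunks above move to, in the codebase's Python 2 idiom; fetch_worker, NUM_THREADS and the example URIs are illustrative, not names from the commit:

    from Queue import Queue
    from threading import Thread

    NUM_THREADS = 2  # illustrative; spider.py takes this from config.spider_threads()

    def fetch_worker(index, fetch_queue, result_queue):
        # Block until work arrives; a (None, None) sentinel ends the loop.
        uri, feed_info = fetch_queue.get(block=True)
        while uri:
            # Stand-in for the real HTTP fetch done with httplib2.
            result_queue.put((uri, feed_info, 'fetched ' + uri))
            uri, feed_info = fetch_queue.get(block=True)

    fetch_queue, result_queue = Queue(), Queue()
    for uri in ('http://example.com/a.xml', 'http://example.com/b.xml'):
        fetch_queue.put((uri, None))

    workers = [Thread(target=fetch_worker, args=(i, fetch_queue, result_queue))
               for i in range(NUM_THREADS)]
    for w in workers:
        w.start()

    # One sentinel per worker marks the end of the queue, as in the hunk above.
    for w in workers:
        fetch_queue.put((None, None))
    for w in workers:
        w.join()

    while not result_queue.empty():
        print result_queue.get()

Because every worker blocks on get(), none of them can exit early merely because it started before the queue was populated.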

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python
import unittest, os, glob, calendar, shutil, time
from planet.spider import filename, spiderFeed, spiderPlanet
from planet.spider import filename, spiderPlanet, writeCache
from planet import feedparser, config
import planet
@@ -43,6 +43,11 @@ class SpiderTest(unittest.TestCase):
self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
def spiderFeed(self, feed_uri):
feed_info = feedparser.parse('<feed/>')
data = feedparser.parse(feed_uri)
writeCache(feed_uri, feed_info, data)
def verify_spiderFeed(self):
files = glob.glob(workdir+"/*")
files.sort()
@@ -65,13 +70,13 @@ class SpiderTest(unittest.TestCase):
def test_spiderFeed(self):
config.load(configfile)
spiderFeed(testfeed % '1b')
self.spiderFeed(testfeed % '1b')
self.verify_spiderFeed()
def test_spiderUpdate(self):
config.load(configfile)
spiderFeed(testfeed % '1a')
spiderFeed(testfeed % '1b')
self.spiderFeed(testfeed % '1a')
self.spiderFeed(testfeed % '1b')
self.verify_spiderFeed()
def verify_spiderPlanet(self):
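With writeCache importable, the test's new spiderFeed helper can populate the cache from a local fixture without touching the spider's HTTP or threading path: it parses '<feed/>' to get an empty cached state, parses the fixture file directly, and hands both to writeCache. A small sketch of the same idea outside the test class; the config and fixture paths are illustrative, not taken from the repository:

    from planet import config, feedparser
    from planet.spider import writeCache

    config.load('tests/data/spider/config.ini')        # illustrative config path
    feed_uri = 'tests/data/spider/testfeed1b.atom'     # illustrative fixture path

    feed_info = feedparser.parse('<feed/>')   # empty prior state, as in the test helper
    data = feedparser.parse(feed_uri)         # parse the fixture directly, no HTTP
    writeCache(feed_uri, feed_info, data)     # write the cache entries for the feed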