Complete HttpThread refactoring
This commit is contained in:
parent e85ae48722
commit 70f971750b
planet/spider.py

@@ -344,11 +344,8 @@ def httpThread(thread_index, input_queue, output_queue, log):
 
     http_cache = config.http_cache_directory()
     h = httplib2.Http(http_cache)
-    try:
-        while True:
-            # The non-blocking get will throw an exception when the queue
-            # is empty which will terminate the thread.
-            uri, feed_info = input_queue.get(block=False)
+    uri, feed_info = input_queue.get(block=True)
+    while uri:
         log.info("Fetching %s via %d", uri, thread_index)
         feed = StringIO('')
         setattr(feed, 'url', uri)
@@ -400,12 +397,11 @@ def httpThread(thread_index, input_queue, output_queue, log):
                 continue
 
         output_queue.put(block=True, item=(uri, feed_info, feed))
+        uri, feed_info = input_queue.get(block=True)
-    except Empty, e:
-        log.info("Thread %d finished", thread_index)
 
 def spiderPlanet(only_if_new = False):
     """ Spider (fetch) an entire planet """
+    # log = planet.getLogger(config.log_level(),config.log_format())
     log = planet.getLogger(config.log_level(),config.log_format())
 
     global index
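Note: the two hunks above are the heart of the refactoring. Instead of a non-blocking get() that ends the worker by raising Queue.Empty, each worker now blocks on get() and leaves its while loop when it receives a (None, None) sentinel. A minimal, self-contained sketch of the same shutdown pattern follows; the names (worker, fetch_queue, parse_queue) and URIs are illustrative, not code from this commit.

    # Sketch of the sentinel-based shutdown pattern (Python 2).
    from Queue import Queue
    from threading import Thread

    def worker(thread_index, input_queue, output_queue):
        uri, feed_info = input_queue.get(block=True)   # wait for work or a sentinel
        while uri:                                     # a (None, None) item ends the loop
            output_queue.put((uri, feed_info, 'fetched ' + uri))
            uri, feed_info = input_queue.get(block=True)

    fetch_queue, parse_queue = Queue(), Queue()
    threads = [Thread(target=worker, args=(i, fetch_queue, parse_queue))
               for i in range(2)]
    for t in threads:
        t.start()
    for uri in ('http://example.com/a', 'http://example.com/b'):
        fetch_queue.put((uri, None))
    for t in threads:                                  # one sentinel per worker
        fetch_queue.put((None, None))
    for t in threads:
        t.join()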
@@ -414,6 +410,7 @@ def spiderPlanet(only_if_new = False):
     timeout = config.feed_timeout()
     try:
         socket.setdefaulttimeout(float(timeout))
+        log.info("Socket timeout set to %d seconds", timeout)
     except:
         try:
             from planet import timeoutsocket
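For context, socket.setdefaulttimeout() applies to every socket created after the call, which is why this one statement covers all subsequent feed fetches; the added line only logs the value. A standalone two-line illustration, not from the commit:

    import socket
    socket.setdefaulttimeout(20.0)                 # affects sockets created from now on
    assert socket.getdefaulttimeout() == 20.0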
@@ -422,21 +419,28 @@ def spiderPlanet(only_if_new = False):
             except:
                 log.warning("Timeout set to invalid value '%s', skipping", timeout)
 
-    if int(config.spider_threads()):
     from Queue import Queue
     from threading import Thread
 
     fetch_queue = Queue()
     parse_queue = Queue()
 
+    threads = {}
+    if int(config.spider_threads()):
         http_cache = config.http_cache_directory()
         if not os.path.exists(http_cache):
             os.makedirs(http_cache, 0700)
 
-        # Load the fetch_queue with all the HTTP(S) uris.
+        # Start all the worker threads
+        for i in range(int(config.spider_threads())):
+            threads[i] = Thread(target=httpThread,
+                args=(i,fetch_queue, parse_queue, log))
+            threads[i].start()
+    else:
         log.info("Building work queue")
 
+    # Load the fetch and parse work queues
     for uri in config.subscriptions():
-        if _is_http_uri(uri):
         # read cached feed info
         sources = config.cache_sources_directory()
         feed_source = filename(sources, uri)
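With this hunk the workers are started before the subscription list is walked, so fetching overlaps queue building. Each worker owns its own httplib2.Http instance pointed at the shared on-disk cache created here. A sketch of that usage, assuming httplib2 is installed; the cache path and URL are made up:

    import httplib2
    h = httplib2.Http('.http_cache')   # on-disk cache dir (the spider pre-creates it, mode 0700)
    response, content = h.request('http://example.com/feed.xml')
    # A later request for the same URI can be served from the cache or
    # revalidated with If-None-Match / If-Modified-Since headers.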
@@ -449,14 +453,14 @@ def spiderPlanet(only_if_new = False):
             log.info("Feed %s gone", uri)
             continue
 
-        fetch_queue.put(item=((uri, feed_info)))
+        if threads and _is_http_uri(uri):
+            fetch_queue.put(item=(uri, feed_info))
+        else:
+            parse_queue.put(item=(uri, feed_info, uri))
 
-        # Start all the worker threads
-        threads = dict([(i, Thread(target=httpThread,
-            args=(i,fetch_queue, parse_queue, log)))
-            for i in range(int(config.spider_threads()))])
-        for t in threads.itervalues():
-            t.start()
+    # Mark the end of the fetch queue
+    for thread in threads.keys():
+        fetch_queue.put(item=(None, None))
 
     # Process the results as they arrive
     while fetch_queue.qsize() or parse_queue.qsize() or threads:
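Once the sentinels are queued, the main loop drains parse_queue without blocking for as long as any worker is alive. A reduced sketch of that non-blocking consumption; illustrative only, and the real loop also reaps dead threads, as the final hunk shows:

    from Queue import Queue, Empty

    q = Queue()
    q.put(('uri', 'feed_info', 'feed'))
    while q.qsize():
        try:
            uri, feed_info, feed = q.get(block=False)  # raises Empty if the
        except Empty:                                  # queue drained meanwhile
            break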
@@ -466,8 +470,19 @@ def spiderPlanet(only_if_new = False):
                 (uri, feed_info, feed) = parse_queue.get(False)
                 try:
 
-                    if int(feed.headers.status) < 300:
-                        data = feedparser.parse(feed)
+                    if not hasattr(feed,'headers') or int(feed.headers.status)<300:
+                        options = {}
+                        if hasattr(feed_info,'feed'):
+                            options['etag'] = \
+                                feed_info.feed.get('planet_http_etag',None)
+                            try:
+                                modified=time.strptime(
+                                    feed_info.feed.get('planet_http_last_modified',
+                                    None))
+                            except:
+                                pass
+
+                        data = feedparser.parse(feed, **options)
                     else:
                         data = feedparser.FeedParserDict({'version':None,
                             'headers':feed.headers, 'entries': [],
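The reworked parse step forwards the cached validators to feedparser: feedparser.parse() accepts etag and modified keyword arguments and turns them into If-None-Match / If-Modified-Since headers when it fetches a URL itself. (As shown, this hunk parses modified but never copies it into options; only etag is passed through here.) A sketch of the same call shape; the URL and validator values are made up:

    import feedparser, time

    options = {}
    options['etag'] = '"abc123"'
    options['modified'] = time.strptime('Mon, 01 Jan 2007 00:00:00 GMT',
                                        '%a, %d %b %Y %H:%M:%S GMT')
    data = feedparser.parse('http://example.com/feed.xml', **options)
    # data.status is 304 when the server says the cached copy is still fresh.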
@@ -482,24 +497,9 @@ def spiderPlanet(only_if_new = False):
                     for line in (traceback.format_exception_only(type, value) +
                         traceback.format_tb(tb)):
                         log.error(line.rstrip())
 
             for index in threads.keys():
                 if not threads[index].isAlive():
                     del threads[index]
+                    if not threads:
                         log.info("Finished threaded part of processing.")
 
-    # Process non-HTTP uris if we are threading, otherwise process *all* uris here.
-    unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
-    for feed in unthreaded_work_queue:
-        try:
-            spiderFeed(feed, only_if_new=only_if_new)
-        except Exception,e:
-            import sys, traceback
-            type, value, tb = sys.exc_info()
-            log.error('Error processing %s', feed)
-            for line in (traceback.format_exception_only(type, value) +
-                traceback.format_tb(tb)):
-                log.error(line.rstrip())
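This last spider.py hunk deletes the separate unthreaded loop entirely: when spider_threads is 0, threads stays an empty dict and every URI lands directly on parse_queue, so the single drain loop covers both modes, and "finished" is now logged only once the last worker has been reaped. A sketch of that exit condition; worker and the sleeps are stand-ins, not commit code:

    import time
    from threading import Thread

    def worker():
        time.sleep(0.1)                      # stand-in for real fetch work

    threads = {0: Thread(target=worker)}
    threads[0].start()
    while threads:                           # same exit condition as the loop above
        for index in threads.keys():         # py2 keys() returns a list (snapshot),
            if not threads[index].isAlive(): # so deleting entries while iterating is safe
                del threads[index]
        time.sleep(0.05)                     # avoid a busy spin in this sketch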
tests/test_spider.py

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 import unittest, os, glob, calendar, shutil, time
-from planet.spider import filename, spiderFeed, spiderPlanet
+from planet.spider import filename, spiderPlanet, writeCache
 from planet import feedparser, config
 import planet
 
@@ -43,6 +43,11 @@ class SpiderTest(unittest.TestCase):
         self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
             filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
 
+    def spiderFeed(self, feed_uri):
+        feed_info = feedparser.parse('<feed/>')
+        data = feedparser.parse(feed_uri)
+        writeCache(feed_uri, feed_info, data)
+
     def verify_spiderFeed(self):
         files = glob.glob(workdir+"/*")
         files.sort()
@@ -65,13 +70,13 @@ class SpiderTest(unittest.TestCase):
 
     def test_spiderFeed(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()
 
     def test_spiderUpdate(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1a')
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1a')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()
 
     def verify_spiderPlanet(self):