Complete HttpThread refactoring

Sam Ruby 2006-11-21 09:11:52 -05:00
parent e85ae48722
commit 70f971750b
2 changed files with 132 additions and 127 deletions

View File

@@ -344,11 +344,8 @@ def httpThread(thread_index, input_queue, output_queue, log):
     http_cache = config.http_cache_directory()
     h = httplib2.Http(http_cache)
-    try:
-        while True:
-            # The non-blocking get will throw an exception when the queue
-            # is empty which will terminate the thread.
-            uri, feed_info = input_queue.get(block=False)
+    uri, feed_info = input_queue.get(block=True)
+    while uri:
         log.info("Fetching %s via %d", uri, thread_index)
         feed = StringIO('')
         setattr(feed, 'url', uri)
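The refactoring replaces the old "non-blocking get raises Empty" termination with a blocking get that loops until a sentinel pair arrives. A minimal sketch of that worker pattern in the same Python 2 style (the example.com URI and the 'fetched' placeholder are illustrative, not Planet's fetch logic):

    from Queue import Queue
    from threading import Thread

    def worker(input_queue, output_queue):
        # Block until work arrives; a (None, None) pair means "shut down".
        uri, feed_info = input_queue.get(block=True)
        while uri:
            output_queue.put((uri, 'fetched'))   # stand-in for the real HTTP fetch
            uri, feed_info = input_queue.get(block=True)

    input_queue, output_queue = Queue(), Queue()
    t = Thread(target=worker, args=(input_queue, output_queue))
    t.start()
    input_queue.put(('http://example.com/feed', {}))
    input_queue.put((None, None))                # sentinel lets the worker exit
    t.join()
    print output_queue.get()                     # ('http://example.com/feed', 'fetched')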
@@ -400,12 +397,11 @@ def httpThread(thread_index, input_queue, output_queue, log):
             continue
         output_queue.put(block=True, item=(uri, feed_info, feed))
-    except Empty, e:
-        log.info("Thread %d finished", thread_index)
+        uri, feed_info = input_queue.get(block=True)

 def spiderPlanet(only_if_new = False):
     """ Spider (fetch) an entire planet """
-    # log = planet.getLogger(config.log_level(),config.log_format())
     log = planet.getLogger(config.log_level(),config.log_format())

     global index
@@ -414,6 +410,7 @@ def spiderPlanet(only_if_new = False):
     timeout = config.feed_timeout()
     try:
         socket.setdefaulttimeout(float(timeout))
+        log.info("Socket timeout set to %d seconds", timeout)
     except:
         try:
             from planet import timeoutsocket
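The new log line simply records the process-wide timeout this block installs. For context, a short sketch of how socket.setdefaulttimeout() affects connections made afterwards (the host and the 20-second value are placeholders, not Planet's feed_timeout):

    import socket

    timeout = 20.0                        # illustrative value
    socket.setdefaulttimeout(timeout)     # affects every socket created after this point
    s = socket.socket()
    try:
        s.connect(('example.com', 80))
    except (socket.timeout, socket.error):
        print 'gave up or failed after at most %s seconds' % timeout
    s.close()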
@@ -422,21 +419,28 @@ def spiderPlanet(only_if_new = False):
         except:
             log.warning("Timeout set to invalid value '%s', skipping", timeout)

-    if int(config.spider_threads()):
     from Queue import Queue
     from threading import Thread

     fetch_queue = Queue()
     parse_queue = Queue()

+    threads = {}
+    if int(config.spider_threads()):
         http_cache = config.http_cache_directory()
         if not os.path.exists(http_cache):
             os.makedirs(http_cache, 0700)
-        # Load the fetch_queue with all the HTTP(S) uris.
+        # Start all the worker threads
+        for i in range(int(config.spider_threads())):
+            threads[i] = Thread(target=httpThread,
+                args=(i,fetch_queue, parse_queue, log))
+            threads[i].start()
+    else:
         log.info("Building work queue")

+    # Load the fetch and parse work queues
     for uri in config.subscriptions():
-        if _is_http_uri(uri):
         # read cached feed info
         sources = config.cache_sources_directory()
         feed_source = filename(sources, uri)
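Keeping the workers in a dict keyed by thread index is what later lets finished threads be deleted one by one. Roughly, with the same Queue/Thread APIs and a stubbed-out worker standing in for httpThread (the thread count 4 stands in for int(config.spider_threads())):

    from Queue import Queue
    from threading import Thread

    def http_worker(index, fetch_queue, parse_queue, log=None):
        pass                                  # stub; the real body is httpThread above

    fetch_queue, parse_queue = Queue(), Queue()
    threads = {}
    for i in range(4):
        threads[i] = Thread(target=http_worker,
                            args=(i, fetch_queue, parse_queue))
        threads[i].start()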
@@ -449,14 +453,14 @@ def spiderPlanet(only_if_new = False):
             log.info("Feed %s gone", uri)
             continue

-            fetch_queue.put(item=((uri, feed_info)))
+        if threads and _is_http_uri(uri):
+            fetch_queue.put(item=(uri, feed_info))
+        else:
+            parse_queue.put(item=(uri, feed_info, uri))

-        # Start all the worker threads
-        threads = dict([(i, Thread(target=httpThread,
-            args=(i,fetch_queue, parse_queue, log)))
-            for i in range(int(config.spider_threads()))])
-        for t in threads.itervalues():
-            t.start()
+    # Mark the end of the fetch queue
+    for thread in threads.keys():
+        fetch_queue.put(item=(None, None))

     # Process the results as they arrive
     while fetch_queue.qsize() or parse_queue.qsize() or threads:
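Because every worker blocks on fetch_queue.get, the producer has to enqueue one (None, None) sentinel per thread, otherwise some workers would block forever. A self-contained sketch of that shutdown step (it joins the workers for brevity, whereas spiderPlanet instead polls the queues and reaps finished threads below):

    from Queue import Queue
    from threading import Thread

    def worker(q):
        uri, info = q.get(block=True)
        while uri:                            # a (None, None) pair ends the loop
            uri, info = q.get(block=True)

    q = Queue()
    threads = dict((i, Thread(target=worker, args=(q,))) for i in range(3))
    for t in threads.values():
        t.start()

    q.put(('http://example.com/a', {}))       # placeholder work items
    q.put(('http://example.com/b', {}))
    for i in threads.keys():                  # one sentinel per worker
        q.put((None, None))
    for t in threads.values():
        t.join()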
@@ -466,8 +470,19 @@ def spiderPlanet(only_if_new = False):
             (uri, feed_info, feed) = parse_queue.get(False)
             try:
-                if int(feed.headers.status) < 300:
-                    data = feedparser.parse(feed)
+                if not hasattr(feed,'headers') or int(feed.headers.status)<300:
+                    options = {}
+                    if hasattr(feed_info,'feed'):
+                        options['etag'] = \
+                            feed_info.feed.get('planet_http_etag',None)
+                        try:
+                            modified=time.strptime(
+                                feed_info.feed.get('planet_http_last_modified',
+                                None))
+                        except:
+                            pass
+                    data = feedparser.parse(feed, **options)
                 else:
                     data = feedparser.FeedParserDict({'version':None,
                         'headers':feed.headers, 'entries': [],
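The options dict exists so cached validators can be handed to feedparser, whose parse() accepts etag and modified keyword arguments for conditional fetching. A hedged example against a plain URL (the address is a placeholder; the planet_http_* keys above are Planet's own cached copies of those headers):

    import feedparser

    url = 'http://example.com/feed.atom'      # placeholder feed URL
    first = feedparser.parse(url)

    # Re-request with the validators from the first response; an unchanged feed
    # comes back as HTTP 304 with no entries to re-process.
    second = feedparser.parse(url,
                              etag=first.get('etag'),
                              modified=first.get('modified'))
    if getattr(second, 'status', None) == 304:
        print 'feed unchanged since last fetch'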
@@ -482,24 +497,9 @@ def spiderPlanet(only_if_new = False):
                 for line in (traceback.format_exception_only(type, value) +
                     traceback.format_tb(tb)):
                     log.error(line.rstrip())

         for index in threads.keys():
             if not threads[index].isAlive():
                 del threads[index]
+                if not threads:
                     log.info("Finished threaded part of processing.")
-
-    # Process non-HTTP uris if we are threading, otherwise process *all* uris here.
-    unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
-    for feed in unthreaded_work_queue:
-        try:
-            spiderFeed(feed, only_if_new=only_if_new)
-        except Exception,e:
-            import sys, traceback
-            type, value, tb = sys.exc_info()
-            log.error('Error processing %s', feed)
-            for line in (traceback.format_exception_only(type, value) +
-                traceback.format_tb(tb)):
-                log.error(line.rstrip())
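The surviving loop drops each thread from the dict as soon as it stops, which is what eventually makes the outer "while ... or threads:" condition false. A small standalone illustration of that reaping pattern (isAlive() is the Python 2 spelling; the sleeping task is a stand-in for a real worker):

    import time
    from threading import Thread

    def short_task():
        time.sleep(0.1)                       # stand-in for real work

    threads = dict((i, Thread(target=short_task)) for i in range(3))
    for t in threads.values():
        t.start()

    while threads:
        for index in threads.keys():          # keys() returns a copy in Python 2, so del is safe
            if not threads[index].isAlive():
                del threads[index]
        time.sleep(0.05)
    print 'all workers finished'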

View File

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import unittest, os, glob, calendar, shutil, time
-from planet.spider import filename, spiderFeed, spiderPlanet
+from planet.spider import filename, spiderPlanet, writeCache
 from planet import feedparser, config
 import planet
@@ -43,6 +43,11 @@ class SpiderTest(unittest.TestCase):
         self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
             filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))

+    def spiderFeed(self, feed_uri):
+        feed_info = feedparser.parse('<feed/>')
+        data = feedparser.parse(feed_uri)
+        writeCache(feed_uri, feed_info, data)
+
     def verify_spiderFeed(self):
         files = glob.glob(workdir+"/*")
         files.sort()
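The new test helper drives writeCache directly instead of going through spiderFeed and the network layer; feedparser.parse('<feed/>') just manufactures an empty "previously cached" structure. For instance, parsing that stub document yields a FeedParserDict with no entries and no feed metadata:

    import feedparser

    feed_info = feedparser.parse('<feed/>')
    print len(feed_info.entries)              # 0
    print feed_info.feed.get('title')         # None -- no metadata in the stub document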
@@ -65,13 +70,13 @@ class SpiderTest(unittest.TestCase):
     def test_spiderFeed(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()

     def test_spiderUpdate(self):
         config.load(configfile)
-        spiderFeed(testfeed % '1a')
-        spiderFeed(testfeed % '1b')
+        self.spiderFeed(testfeed % '1a')
+        self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()

     def verify_spiderPlanet(self):