Complete HttpThread refactoring

Sam Ruby 2006-11-21 09:11:52 -05:00
parent e85ae48722
commit 70f971750b
2 changed files with 132 additions and 127 deletions

View File

@@ -344,11 +344,8 @@ def httpThread(thread_index, input_queue, output_queue, log):
http_cache = config.http_cache_directory()
h = httplib2.Http(http_cache)
try:
while True:
# The non-blocking get will throw an exception when the queue
# is empty which will terminate the thread.
uri, feed_info = input_queue.get(block=False)
uri, feed_info = input_queue.get(block=True)
while uri:
log.info("Fetching %s via %d", uri, thread_index)
feed = StringIO('')
setattr(feed, 'url', uri)
@@ -400,12 +397,11 @@ def httpThread(thread_index, input_queue, output_queue, log):
continue
output_queue.put(block=True, item=(uri, feed_info, feed))
except Empty, e:
log.info("Thread %d finished", thread_index)
uri, feed_info = input_queue.get(block=True)
def spiderPlanet(only_if_new = False):
""" Spider (fetch) an entire planet """
# log = planet.getLogger(config.log_level(),config.log_format())
log = planet.getLogger(config.log_level(),config.log_format())
global index
@@ -414,6 +410,7 @@ def spiderPlanet(only_if_new = False):
timeout = config.feed_timeout()
try:
socket.setdefaulttimeout(float(timeout))
log.info("Socket timeout set to %d seconds", timeout)
except:
try:
from planet import timeoutsocket
@@ -422,21 +419,28 @@ def spiderPlanet(only_if_new = False):
except:
log.warning("Timeout set to invalid value '%s', skipping", timeout)
if int(config.spider_threads()):
from Queue import Queue
from threading import Thread
fetch_queue = Queue()
parse_queue = Queue()
threads = {}
if int(config.spider_threads()):
http_cache = config.http_cache_directory()
if not os.path.exists(http_cache):
os.makedirs(http_cache, 0700)
# Load the fetch_queue with all the HTTP(S) uris.
# Start all the worker threads
for i in range(int(config.spider_threads())):
threads[i] = Thread(target=httpThread,
args=(i,fetch_queue, parse_queue, log))
threads[i].start()
else:
log.info("Building work queue")
# Load the fetch and parse work queues
for uri in config.subscriptions():
if _is_http_uri(uri):
# read cached feed info
sources = config.cache_sources_directory()
feed_source = filename(sources, uri)
@@ -449,14 +453,14 @@ def spiderPlanet(only_if_new = False):
log.info("Feed %s gone", uri)
continue
fetch_queue.put(item=((uri, feed_info)))
if threads and _is_http_uri(uri):
fetch_queue.put(item=(uri, feed_info))
else:
parse_queue.put(item=(uri, feed_info, uri))
# Start all the worker threads
threads = dict([(i, Thread(target=httpThread,
args=(i,fetch_queue, parse_queue, log)))
for i in range(int(config.spider_threads()))])
for t in threads.itervalues():
t.start()
# Mark the end of the fetch queue
for thread in threads.keys():
fetch_queue.put(item=(None, None))
# Process the results as they arrive
while fetch_queue.qsize() or parse_queue.qsize() or threads:
@@ -466,8 +470,19 @@ def spiderPlanet(only_if_new = False):
(uri, feed_info, feed) = parse_queue.get(False)
try:
if int(feed.headers.status) < 300:
data = feedparser.parse(feed)
if not hasattr(feed,'headers') or int(feed.headers.status)<300:
options = {}
if hasattr(feed_info,'feed'):
options['etag'] = \
feed_info.feed.get('planet_http_etag',None)
try:
modified=time.strptime(
feed_info.feed.get('planet_http_last_modified',
None))
except:
pass
data = feedparser.parse(feed, **options)
else:
data = feedparser.FeedParserDict({'version':None,
'headers':feed.headers, 'entries': [],
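The added branch above builds the keyword arguments for feedparser.parse from the cached feed_info, so a previously recorded ETag and Last-Modified value can be handed back to the parser, with the strptime call wrapped in a try/except because the cached date may be missing or malformed. A hedged sketch of that conditional-parse call in isolation; the placeholder values stand in for what spider.py reads out of its cache:

    import time
    import feedparser

    options = {}
    cached_etag = '"abc123"'                      # placeholder for planet_http_etag
    cached_modified = 'Tue Nov 21 09:11:52 2006'  # placeholder for planet_http_last_modified

    if cached_etag:
        options['etag'] = cached_etag
    try:
        # strptime's default format matches asctime-style strings; anything
        # else (or None) raises and the conditional date is simply skipped.
        options['modified'] = time.strptime(cached_modified)
    except (TypeError, ValueError):
        pass

    data = feedparser.parse('http://example.com/feed.xml', **options)

When feedparser is given a URL to fetch itself, etag and modified become If-None-Match and If-Modified-Since request headers on that fetch.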
@@ -482,24 +497,9 @@
for line in (traceback.format_exception_only(type, value) +
traceback.format_tb(tb)):
log.error(line.rstrip())
for index in threads.keys():
if not threads[index].isAlive():
del threads[index]
if not threads:
log.info("Finished threaded part of processing.")
# Process non-HTTP uris if we are threading, otherwise process *all* uris here.
unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
for feed in unthreaded_work_queue:
try:
spiderFeed(feed, only_if_new=only_if_new)
except Exception,e:
import sys, traceback
type, value, tb = sys.exc_info()
log.error('Error processing %s', feed)
for line in (traceback.format_exception_only(type, value) +
traceback.format_tb(tb)):
log.error(line.rstrip())
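The refactored httpThread blocks on its input queue and keeps fetching until it receives a (None, None) sentinel, with one sentinel enqueued per worker after the fetch queue has been loaded; the hunks suggest the earlier version started the threads before the queue was loaded and relied on a Queue.Empty exception from a non-blocking get to end them, which presumably risked a worker exiting before any work had been queued. A minimal sketch of the sentinel pattern the hunks above move to, in the codebase's Python 2 idiom; fetch_worker, NUM_THREADS and the example URIs are illustrative, not names from the commit:

    from Queue import Queue
    from threading import Thread

    NUM_THREADS = 2  # illustrative; spider.py takes this from config.spider_threads()

    def fetch_worker(index, fetch_queue, result_queue):
        # Block until work arrives; a (None, None) sentinel ends the loop.
        uri, feed_info = fetch_queue.get(block=True)
        while uri:
            # Stand-in for the real HTTP fetch done with httplib2.
            result_queue.put((uri, feed_info, 'fetched ' + uri))
            uri, feed_info = fetch_queue.get(block=True)

    fetch_queue, result_queue = Queue(), Queue()
    for uri in ('http://example.com/a.xml', 'http://example.com/b.xml'):
        fetch_queue.put((uri, None))

    workers = [Thread(target=fetch_worker, args=(i, fetch_queue, result_queue))
               for i in range(NUM_THREADS)]
    for w in workers:
        w.start()

    # One sentinel per worker marks the end of the queue, as in the hunk above.
    for w in workers:
        fetch_queue.put((None, None))
    for w in workers:
        w.join()

    while not result_queue.empty():
        print result_queue.get()

Because every worker blocks on get(), none of them can exit early merely because it started before the queue was populated.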

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python
import unittest, os, glob, calendar, shutil, time
from planet.spider import filename, spiderFeed, spiderPlanet
from planet.spider import filename, spiderPlanet, writeCache
from planet import feedparser, config
import planet
@@ -43,6 +43,11 @@ class SpiderTest(unittest.TestCase):
self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
def spiderFeed(self, feed_uri):
feed_info = feedparser.parse('<feed/>')
data = feedparser.parse(feed_uri)
writeCache(feed_uri, feed_info, data)
def verify_spiderFeed(self):
files = glob.glob(workdir+"/*")
files.sort()
@@ -65,13 +70,13 @@ class SpiderTest(unittest.TestCase):
def test_spiderFeed(self):
config.load(configfile)
spiderFeed(testfeed % '1b')
self.spiderFeed(testfeed % '1b')
self.verify_spiderFeed()
def test_spiderUpdate(self):
config.load(configfile)
spiderFeed(testfeed % '1a')
spiderFeed(testfeed % '1b')
self.spiderFeed(testfeed % '1a')
self.spiderFeed(testfeed % '1b')
self.verify_spiderFeed()
def verify_spiderPlanet(self):
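With writeCache importable, the test's new spiderFeed helper can populate the cache from a local fixture without touching the spider's HTTP or threading path: it parses '<feed/>' to get an empty cached state, parses the fixture file directly, and hands both to writeCache. A small sketch of the same idea outside the test class; the config and fixture paths are illustrative, not taken from the repository:

    from planet import config, feedparser
    from planet.spider import writeCache

    config.load('tests/data/spider/config.ini')        # illustrative config path
    feed_uri = 'tests/data/spider/testfeed1b.atom'     # illustrative fixture path

    feed_info = feedparser.parse('<feed/>')   # empty prior state, as in the test helper
    data = feedparser.parse(feed_uri)         # parse the fixture directly, no HTTP
    writeCache(feed_uri, feed_info, data)     # write the cache entries for the feed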