Spider threads

Sam Ruby 2006-11-14 14:08:15 -05:00
commit 6ebbed2ab7
10 changed files with 1053 additions and 47 deletions

THANKS

@@ -4,7 +4,7 @@ Elias Torres - FOAF OnlineAccounts
Jacques Distler - Template patches
Michael Koziarski - HTTP Auth fix
Brian Ewins - Win32 / Portalocker
Joe Gregorio - Invoke same version of Python for filters
Joe Gregorio - python versioning for filters, verbose tests, spider_threads
Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc


@@ -98,6 +98,9 @@ use for logging output. Note: this configuration value is processed
<dd>Number of seconds to wait for any given feed</dd>
<dt>new_feed_items</dt>
<dd>Number of items to take from new feeds</dd>
<dt>spider_threads</dt>
<dd>The number of threads to use when spidering. When set to 0, the default,
no threads are used and spidering follows the traditional algorithm.</dd>
</dl>
</blockquote>
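
As an aside, a minimal sketch of how this option is consumed elsewhere in this commit (config.load and the generated config.spider_threads() accessor exist in this codebase; the config filename is hypothetical):

    from planet import config
    config.load('spider.ini')               # hypothetical config with "spider_threads = 2" under [Planet]
    if int(config.spider_threads()):
        print 'threaded spidering enabled'  # spiderPlanet() takes the httplib2/work-queue path
    else:
        print 'traditional single-threaded spidering'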


@@ -54,7 +54,10 @@ if __name__ == "__main__":
    if not offline:
        from planet import spider
        try:
            spider.spiderPlanet(only_if_new=only_if_new)
        except Exception, e:
            print e

    from planet import splice
    doc = splice.splice()


@@ -31,25 +31,4 @@ def getLogger(level, format):
    return logger

def setTimeout(timeout):
    """ time out rather than hang forever on ultra-slow servers."""
    if timeout:
        try:
            timeout = float(timeout)
        except:
            logger.warning("Timeout set to invalid value '%s', skipping", timeout)
            timeout = None

    if timeout:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(timeout)
            logger.info("Socket timeout set to %d seconds", timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                logger.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(timeout)
                logger.info("Socket timeout set to %d seconds", timeout)
            else:
                logger.error("Unable to set timeout to %d seconds", timeout)


@@ -100,6 +100,7 @@ def __init__():
    define_planet('owner_email', '')
    define_planet('output_theme', '')
    define_planet('output_dir', 'output')
    define_planet('spider_threads', 0)

    define_planet_list('template_files')
    define_planet_list('bill_of_materials')
@@ -282,6 +283,11 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    except:
        logger.exception("Unable to read %s readinglist", list)

def http_cache_directory():
    if parser.has_option('Planet', 'http_cache_directory'):
        return parser.get('Planet', 'http_cache_directory')
    else:
        return os.path.join(cache_directory(), 'sources/http')

def cache_sources_directory():
    if parser.has_option('Planet', 'cache_sources_directory'):

@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
__version__ = "4.2-pre-" + "$Revision: 1.144 $"[11:16] + "-cvs"
__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

planet/httplib2/__init__.py (new file)

@@ -0,0 +1,842 @@
"""
httplib2

A caching http interface that supports ETags and gzip
to conserve bandwidth.

Requires Python 2.3 or later
"""

from __future__ import generators

__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
                    "James Antill",
                    "Xavier Verges Farrero",
                    "Jonathan Feinberg",
                    "Blair Zajac"]
__license__ = "MIT"
__version__ = "$Rev: 209 $"

import re
import md5
import rfc822
import StringIO
import gzip
import zlib
import httplib
import urlparse
import base64
import os
import copy
import calendar
import time
import random
import sha
import hmac
from gettext import gettext as _
from socket import gaierror

__all__ = ['Http', 'Response', 'HttpLib2Error',
           'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
           'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
           'debuglevel']

# The httplib debug level, set to a non-zero value to get debug output
debuglevel = 0
# Python 2.3 support
if 'sorted' not in __builtins__:
    def sorted(seq):
        seq.sort()
        return seq

# Python 2.3 support
def HTTPResponse__getheaders(self):
    """Return list of (header, value) tuples."""
    if self.msg is None:
        raise httplib.ResponseNotReady()
    return self.msg.items()

if not hasattr(httplib.HTTPResponse, 'getheaders'):
    httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
# All exceptions raised here derive from HttpLib2Error
class HttpLib2Error(Exception): pass

class RedirectMissingLocation(HttpLib2Error): pass

class RedirectLimit(HttpLib2Error): pass

class FailedToDecompressContent(HttpLib2Error): pass

class UnimplementedDigestAuthOptionError(HttpLib2Error): pass

class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass

# Open Items:
# -----------
# Proxy support
# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)

# Pluggable cache storage (supports storing the cache in
#   flat files by default. We need a plug-in architecture
#   that can support Berkeley DB and Squid)

# == Known Issues ==
# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
# Does not handle Cache-Control: max-stale
# Does not use Age: headers when calculating cache freshness.

# The number of redirections to follow before giving up.
# Note that only GET redirects are automatically followed.
# Will also honor 301 requests by saving that info and never
# requesting that URI again.
DEFAULT_MAX_REDIRECTS = 5
# Which headers are hop-by-hop headers by default
HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']

def _get_end2end_headers(response):
    hopbyhop = list(HOP_BY_HOP)
    hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
    return [header for header in response.keys() if header not in hopbyhop]

URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

        (scheme, authority, path, query, fragment) = parse_uri(uri)
    """
    groups = URI.match(uri).groups()
    return (groups[1], groups[3], groups[4], groups[6], groups[8])
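
For illustration (evaluated in this module's namespace; the URI is hypothetical), parse_uri splits a reference into its five RFC 3986 components:

    (scheme, authority, path, query, fragment) = parse_uri("http://example.org/feed.atom?q=1#top")
    # -> ('http', 'example.org', '/feed.atom', 'q=1', 'top')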
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')

def _normalize_headers(headers):
    return dict([ (key.lower(), NORMALIZE_SPACE.sub(' ', value).strip())  for (key, value) in headers.iteritems()])

def _parse_cache_control(headers):
    retval = {}
    if headers.has_key('cache-control'):
        parts = headers['cache-control'].split(',')
        parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")]
        parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")]
        retval = dict(parts_with_args + parts_wo_args)
    return retval
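
A quick illustration of the parsed form (hypothetical header value): directives with arguments keep their string value, bare directives map to 1.

    _parse_cache_control({'cache-control': 'max-age=3600, no-cache'})
    # -> {'max-age': '3600', 'no-cache': 1}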
# Whether to use a strict mode to parse WWW-Authenticate headers
# Might lead to bad results in case of ill-formed header values,
# so disabled by default, falling back to relaxed parsing.
# Set to true to turn on, useful for testing servers.
USE_WWW_AUTH_STRICT_PARSING = 0

# In regex below:
#    [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+             matches a "token" as defined by HTTP
#    "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?"    matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
#    \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
UNQUOTE_PAIRS = re.compile(r'\\(.)')
def _parse_www_authenticate(headers, headername='www-authenticate'):
    """Returns a dictionary of dictionaries, one dict
    per auth_scheme."""
    retval = {}
    if headers.has_key(headername):
        authenticate = headers[headername].strip()
        www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
        while authenticate:
            # Break off the scheme at the beginning of the line
            if headername == 'authentication-info':
                (auth_scheme, the_rest) = ('digest', authenticate)
            else:
                (auth_scheme, the_rest) = authenticate.split(" ", 1)
            # Now loop over all the key value pairs that come after the scheme,
            # being careful not to roll into the next scheme
            match = www_auth.search(the_rest)
            auth_params = {}
            while match:
                if match and len(match.groups()) == 3:
                    (key, value, the_rest) = match.groups()
                    auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
                match = www_auth.search(the_rest)
            retval[auth_scheme.lower()] = auth_params
            authenticate = the_rest.strip()
    return retval
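
For illustration, a Basic challenge (hypothetical header value) parses into one dict per scheme, keyed by the lowercased scheme name:

    _parse_www_authenticate({'www-authenticate': 'Basic realm="planet"'})
    # -> {'basic': {'realm': 'planet'}}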
def _entry_disposition(response_headers, request_headers):
    """Determine freshness from the Date, Expires and Cache-Control headers.

    We don't handle the following:

    1. Cache-Control: max-stale
    2. Age: headers are not used in the calculations.

    Note that this algorithm is simpler than you might think
    because we are operating as a private (non-shared) cache.
    This lets us ignore 's-maxage'. We can also ignore
    'proxy-invalidate' since we aren't a proxy.
    We will never return a stale document as
    fresh as a design decision, and thus the non-implementation
    of 'max-stale'. This also lets us safely ignore 'must-revalidate'
    since we operate as if every server has sent 'must-revalidate'.
    Since we are private we get to ignore both 'public' and
    'private' parameters. We also ignore 'no-transform' since
    we don't do any transformations.
    The 'no-store' parameter is handled at a higher level.
    So the only Cache-Control parameters we look at are:

        no-cache
        only-if-cached
        max-age
        min-fresh
    """

    retval = "STALE"
    cc = _parse_cache_control(request_headers)
    cc_response = _parse_cache_control(response_headers)

    if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1:
        retval = "TRANSPARENT"
        if 'cache-control' not in request_headers:
            request_headers['cache-control'] = 'no-cache'
    elif cc.has_key('no-cache'):
        retval = "TRANSPARENT"
    elif cc_response.has_key('no-cache'):
        retval = "STALE"
    elif cc.has_key('only-if-cached'):
        retval = "FRESH"
    elif response_headers.has_key('date'):
        date = calendar.timegm(rfc822.parsedate_tz(response_headers['date']))
        now = time.time()
        current_age = max(0, now - date)
        if cc_response.has_key('max-age'):
            freshness_lifetime = int(cc_response['max-age'])
        elif response_headers.has_key('expires'):
            expires = rfc822.parsedate_tz(response_headers['expires'])
            freshness_lifetime = max(0, calendar.timegm(expires) - date)
        else:
            freshness_lifetime = 0
        if cc.has_key('max-age'):
            freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
        if cc.has_key('min-fresh'):
            current_age += int(cc['min-fresh'])
        if freshness_lifetime > current_age:
            retval = "FRESH"
    return retval
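
To make the dispositions concrete, a small hand-checked illustration (header values are hypothetical; the first call assumes it runs after Nov 2006, so the 60-second lifetime has long expired):

    response_headers = {'date': 'Tue, 14 Nov 2006 14:08:15 GMT', 'cache-control': 'max-age=60'}
    print _entry_disposition(response_headers, {})                              # -> "STALE"
    print _entry_disposition(response_headers, {'cache-control': 'no-cache'})   # -> "TRANSPARENT"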
def _decompressContent(response, new_content):
    content = new_content
    try:
        if response.get('content-encoding', None) == 'gzip':
            content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
            response['content-length'] = str(len(content))
        if response.get('content-encoding', None) == 'deflate':
            content = zlib.decompress(content)
            response['content-length'] = str(len(content))
    except:
        content = ""
        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'))
    return content

def _updateCache(request_headers, response_headers, content, cache, cachekey):
    if cachekey:
        cc = _parse_cache_control(request_headers)
        cc_response = _parse_cache_control(response_headers)
        if cc.has_key('no-store') or cc_response.has_key('no-store'):
            cache.delete(cachekey)
        else:
            f = StringIO.StringIO("")
            info = rfc822.Message(StringIO.StringIO(""))
            for key, value in response_headers.iteritems():
                info[key] = value
            f.write(str(info))
            f.write("\r\n\r\n")
            f.write(content)
            cache.set(cachekey, f.getvalue())

def _cnonce():
    dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
    return dig[:16]

def _wsse_username_token(cnonce, iso_now, password):
    return base64.encodestring(sha.new("%s%s%s" % (cnonce, iso_now, password)).digest()).strip()
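
This helper computes the WSSE UsernameToken digest, Base64(SHA1(nonce + created + password)). For illustration (evaluated in this module's namespace; 'secret' is a hypothetical password):

    iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    token = _wsse_username_token(_cnonce(), iso_now, 'secret')
    # token is the PasswordDigest value that WsseAuthentication.request (below) places in X-WSSE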
# For credentials we need two things, first
# a pool of credentials to try (not necessarily tied to Basic, Digest, etc.)
# Then we also need a list of URIs that have already demanded authentication
# That list is tricky since sub-URIs can take the same auth, or the
# auth scheme may change as you descend the tree.
# So we also need each Auth instance to be able to tell us
# how close to the 'top' it is.

class Authentication:
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        self.path = path
        self.host = host
        self.credentials = credentials
        self.http = http

    def depth(self, request_uri):
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return request_uri[len(self.path):].count("/")

    def inscope(self, host, request_uri):
        # XXX Should we normalize the request_uri?
        (scheme, authority, path, query, fragment) = parse_uri(request_uri)
        return (host == self.host) and path.startswith(self.path)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header. Override this in sub-classes."""
        pass

    def response(self, response, content):
        """Gives us a chance to update with new nonces
        or such returned from the last authorized response.
        Override this in sub-classes if necessary.

        Return TRUE if the request is to be retried, for
        example Digest may return stale=true.
        """
        return False
class BasicAuthentication(Authentication):
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'Basic ' + base64.encodestring("%s:%s" % self.credentials).strip()
class DigestAuthentication(Authentication):
    """Only do qop='auth' and MD5, since that
    is all Apache currently implements"""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['digest']
        qop = self.challenge.get('qop')
        self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
        if self.challenge['qop'] is None:
            raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5')
        if self.challenge['algorithm'] != 'MD5':
            raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
        self.challenge['nc'] = 1

    def request(self, method, request_uri, headers, content, cnonce = None):
        """Modify the request headers"""
        H = lambda x: md5.new(x).hexdigest()
        KD = lambda s, d: H("%s:%s" % (s, d))
        A2 = "".join([method, ":", request_uri])
        self.challenge['cnonce'] = cnonce or _cnonce()
        request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
                    '%08x' % self.challenge['nc'],
                    self.challenge['cnonce'],
                    self.challenge['qop'], H(A2)
                    ))
        headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['nonce'],
                request_uri,
                self.challenge['algorithm'],
                request_digest,
                self.challenge['qop'],
                self.challenge['nc'],
                self.challenge['cnonce'],
                )
        self.challenge['nc'] += 1

    def response(self, response, content):
        if not response.has_key('authentication-info'):
            challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
            if 'true' == challenge.get('stale'):
                self.challenge['nonce'] = challenge['nonce']
                self.challenge['nc'] = 1
                return True
        else:
            updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
            if updated_challenge.has_key('nextnonce'):
                self.challenge['nonce'] = updated_challenge['nextnonce']
                self.challenge['nc'] = 1
        return False
class HmacDigestAuthentication(Authentication):
    """Adapted from Robert Sayre's code and DigestAuthentication above."""
    __author__ = "Thomas Broyer (t.broyer@ltgt.net)"

    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
        challenge = _parse_www_authenticate(response, 'www-authenticate')
        self.challenge = challenge['hmacdigest']
        print self.challenge
        # TODO: self.challenge['domain']
        self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
        if self.challenge['reason'] not in ['unauthorized', 'integrity']:
            self.challenge['reason'] = 'unauthorized'
        self.challenge['salt'] = self.challenge.get('salt', '')
        if not self.challenge.get('snonce'):
            raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty."))
        self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
        if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
            raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
        self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
        if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
            raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
        if self.challenge['algorithm'] == 'HMAC-MD5':
            self.hashmod = md5
        else:
            self.hashmod = sha
        if self.challenge['pw-algorithm'] == 'MD5':
            self.pwhashmod = md5
        else:
            self.pwhashmod = sha
        self.key = "".join([self.credentials[0], ":",
                    self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                    ":", self.challenge['realm']
                    ])
        print response['www-authenticate']
        print "".join([self.credentials[1], self.challenge['salt']])
        print "key_str = %s" % self.key
        self.key = self.pwhashmod.new(self.key).hexdigest().lower()

    def request(self, method, request_uri, headers, content):
        """Modify the request headers"""
        keys = _get_end2end_headers(headers)
        keylist = "".join(["%s " % k for k in keys])
        headers_val = "".join([headers[k] for k in keys])
        created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
        cnonce = _cnonce()
        request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
        print "key = %s" % self.key
        print "msg = %s" % request_digest
        request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
        headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
                self.credentials[0],
                self.challenge['realm'],
                self.challenge['snonce'],
                cnonce,
                request_uri,
                created,
                request_digest,
                keylist,
                )

    def response(self, response, content):
        challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
        if challenge.get('reason') in ['integrity', 'stale']:
            return True
        return False
class WsseAuthentication(Authentication):
    """This is thinly tested and should not be relied upon.
    At this time there isn't any third party server to test against.
    Blogger and TypePad implemented this algorithm at one point
    but Blogger has since switched to Basic over HTTPS and
    TypePad has implemented it wrong, by never issuing a 401
    challenge but instead requiring your client to telepathically know that
    their endpoint is expecting WSSE profile="UsernameToken"."""
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['Authorization'] = 'WSSE profile="UsernameToken"'
        iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        cnonce = _cnonce()
        password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
        headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
                self.credentials[0],
                password_digest,
                cnonce,
                iso_now)

class GoogleLoginAuthentication(Authentication):
    def __init__(self, credentials, host, request_uri, headers, response, content, http):
        from urllib import urlencode
        Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)

        auth = dict(Email=credentials[0], Passwd=credentials[1], service='cl', source=headers['user-agent'])
        resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
        lines = content.split('\n')
        d = dict([tuple(line.split("=", 1)) for line in lines if line])
        if resp.status == 403:
            self.Auth = ""
        else:
            self.Auth = d['Auth']

    def request(self, method, request_uri, headers, content):
        """Modify the request headers to add the appropriate
        Authorization header."""
        headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
AUTH_SCHEME_CLASSES = {
    "basic": BasicAuthentication,
    "wsse": WsseAuthentication,
    "digest": DigestAuthentication,
    "hmacdigest": HmacDigestAuthentication,
    "googlelogin": GoogleLoginAuthentication
}

AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]

class FileCache:
    """Uses a local directory as a store for cached files.
    Not really safe to use if multiple threads or processes are going to
    be running on the same cache.
    """
    def __init__(self, cache):
        self.cache = cache
        if not os.path.exists(cache):
            os.makedirs(self.cache)

    def get(self, key):
        retval = None
        cacheFullPath = os.path.join(self.cache, key)
        try:
            f = file(cacheFullPath, "r")
            retval = f.read()
            f.close()
        except:
            pass
        return retval

    def set(self, key, value):
        cacheFullPath = os.path.join(self.cache, key)
        f = file(cacheFullPath, "w")
        f.write(value)
        f.close()

    def delete(self, key):
        cacheFullPath = os.path.join(self.cache, key)
        if os.path.exists(cacheFullPath):
            os.remove(cacheFullPath)
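
For illustration, Http (below) keys entries by the md5 of the defragmented URI, and FileCache itself behaves like a tiny string store (the directory path is hypothetical and created on demand):

    import md5, httplib2
    cache = httplib2.FileCache('/tmp/demo_cache')
    key = md5.new('http://example.org/').hexdigest()   # the same keying scheme Http.request uses
    cache.set(key, 'raw response bytes')
    print cache.get(key)                               # -> raw response bytes
    cache.delete(key)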
class Http:
    """An HTTP client that handles all
    methods, caching, ETags, compression,
    HTTPS, Basic, Digest, WSSE, etc.
    """
    def __init__(self, cache=None):
        # Map domain name to an httplib connection
        self.connections = {}
        # The location of the cache, for now a directory
        # where cached responses are held.
        if cache and isinstance(cache, str):
            self.cache = FileCache(cache)
        else:
            self.cache = cache

        # tuples of name, password
        self.credentials = []

        # authorization objects
        self.authorizations = []

        self.follow_all_redirects = False

        self.ignore_etag = False

    def _auth_from_challenge(self, host, request_uri, headers, response, content):
        """A generator that creates Authorization objects
           that can be applied to requests.
        """
        challenges = _parse_www_authenticate(response, 'www-authenticate')
        for cred in self.credentials:
            for scheme in AUTH_SCHEME_ORDER:
                if challenges.has_key(scheme):
                    yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)

    def add_credentials(self, name, password):
        """Add a name and password that will be used
        any time a request requires authentication."""
        self.credentials.append((name, password))

    def clear_credentials(self):
        """Remove all the names and passwords
        that are used for authentication"""
        self.credentials = []
        self.authorizations = []

    def _conn_request(self, conn, request_uri, method, body, headers):
        for i in range(2):
            try:
                conn.request(method, request_uri, body, headers)
                response = conn.getresponse()
            except:
                if i == 0:
                    conn.close()
                    conn.connect()
                    continue
                else:
                    raise
            else:
                content = response.read()
                response = Response(response)
                content = _decompressContent(response, content)
            break
        return (response, content)
    def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
        """Do the actual request using the connection object
        and also follow one level of redirects if necessary"""

        auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
        auth = auths and sorted(auths)[0][1] or None
        if auth:
            auth.request(method, request_uri, headers, body)

        (response, content) = self._conn_request(conn, request_uri, method, body, headers)

        if auth:
            if auth.response(response, body):
                auth.request(method, request_uri, headers, body)
                (response, content) = self._conn_request(conn, request_uri, method, body, headers)
                response._stale_digest = 1

        if response.status == 401:
            for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
                authorization.request(method, request_uri, headers, body)
                (response, content) = self._conn_request(conn, request_uri, method, body, headers)
                if response.status != 401:
                    self.authorizations.append(authorization)
                    authorization.response(response, body)
                    break

        if (self.follow_all_redirects or method in ["GET", "HEAD"]) or response.status == 303:
            if response.status in [300, 301, 302, 303, 307]:
                # Pick out the location header and basically start from the beginning
                # remembering first to strip the ETag header and decrement our 'depth'
                if redirections:
                    if not response.has_key('location') and response.status != 300:
                        raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."))
                    # Fix-up relative redirects (which violate an RFC 2616 MUST)
                    if response.has_key('location'):
                        location = response['location']
                        (scheme, authority, path, query, fragment) = parse_uri(location)
                        if authority == None:
                            response['location'] = urlparse.urljoin(absolute_uri, location)
                    if response.status == 301 and method in ["GET", "HEAD"]:
                        response['-x-permanent-redirect-url'] = response['location']
                        response['-location'] = absolute_uri
                        _updateCache(headers, response, content, self.cache, cachekey)
                    if headers.has_key('if-none-match'):
                        del headers['if-none-match']
                    if headers.has_key('if-modified-since'):
                        del headers['if-modified-since']
                    if response.has_key('location'):
                        location = response['location']
                        old_response = copy.deepcopy(response)
                        old_response['-location'] = absolute_uri
                        redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
                        (response, content) = self.request(location, redirect_method, body=body, headers=headers, redirections=redirections - 1)
                        response.previous = old_response
                else:
                    raise RedirectLimit( _("Redirected more times than redirection_limit allows."))
            elif response.status in [200, 203] and method == "GET":
                # Don't cache 206's since we aren't going to handle byte range requests
                response['-location'] = absolute_uri
                _updateCache(headers, response, content, self.cache, cachekey)

        return (response, content)
    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS):
        """ Performs a single HTTP request.

        The 'uri' is the URI of the HTTP resource and can begin
        with either 'http' or 'https'. The value of 'uri' must be an absolute URI.

        The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
        There is no restriction on the methods allowed.

        The 'body' is the entity body to be sent with the request. It is a string
        object.

        Any extra headers that are to be sent with the request should be provided in the
        'headers' dictionary.

        The maximum number of redirects to follow before raising an
        exception is 'redirections'. The default is 5.

        The return value is a tuple of (response, content), the first
        being an instance of the 'Response' class, the second being
        a string that contains the response entity body.
        """
        if headers is None:
            headers = {}
        else:
            headers = _normalize_headers(headers)

        if not headers.has_key('user-agent'):
            headers['user-agent'] = "Python-httplib2/%s" % __version__

        (scheme, authority, path, query, fragment) = parse_uri(uri)
        authority = authority.lower()
        if not path:
            path = "/"
        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        if not self.connections.has_key(scheme+":"+authority):
            connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
            conn = self.connections[scheme+":"+authority] = connection_type(authority)
            conn.set_debuglevel(debuglevel)
        else:
            conn = self.connections[scheme+":"+authority]

        if method in ["GET", "HEAD"] and 'range' not in headers:
            headers['accept-encoding'] = 'compress, gzip'

        info = rfc822.Message(StringIO.StringIO(""))
        cached_value = None
        if self.cache:
            cachekey = md5.new(defrag_uri).hexdigest()
            cached_value = self.cache.get(cachekey)
            if cached_value:
                try:
                    f = StringIO.StringIO(cached_value)
                    info = rfc822.Message(f)
                    content = cached_value.split('\r\n\r\n', 1)[1]
                except:
                    self.cache.delete(cachekey)
                    cachekey = None
                    cached_value = None
        else:
            cachekey = None

        if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
            # http://www.w3.org/1999/04/Editing/
            headers['if-match'] = info['etag']

        if method not in ["GET", "HEAD"] and self.cache and cachekey:
            # RFC 2616 Section 13.10
            self.cache.delete(cachekey)

        if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
            if info.has_key('-x-permanent-redirect-url'):
                # Should cached permanent redirects be counted in our redirection count? For now, yes.
                (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers=headers, redirections=redirections - 1)
                response.previous = Response(info)
                response.previous.fromcache = True
            else:
                # Determine our course of action:
                #   Is the cached entry fresh or stale?
                #   Has the client requested a non-cached response?
                #
                # There seem to be three possible answers:
                # 1. [FRESH] Return the cache entry w/o doing a GET
                # 2. [STALE] Do the GET (but add in cache validators if available)
                # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
                entry_disposition = _entry_disposition(info, headers)

                if entry_disposition == "FRESH":
                    if not cached_value:
                        info['status'] = '504'
                        content = ""
                    response = Response(info)
                    if cached_value:
                        response.fromcache = True
                    return (response, content)

                if entry_disposition == "STALE":
                    if info.has_key('etag') and not self.ignore_etag:
                        headers['if-none-match'] = info['etag']
                    if info.has_key('last-modified'):
                        headers['if-modified-since'] = info['last-modified']
                elif entry_disposition == "TRANSPARENT":
                    pass

                (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)

            if response.status == 304 and method == "GET":
                # Rewrite the cache entry with the new end-to-end headers
                # Take all headers that are in response
                # and overwrite their values in info.
                # unless they are hop-by-hop, or are listed in the connection header.
                for key in _get_end2end_headers(response):
                    info[key] = response[key]
                merged_response = Response(info)
                if hasattr(response, "_stale_digest"):
                    merged_response._stale_digest = response._stale_digest
                try:
                    _updateCache(headers, merged_response, content, self.cache, cachekey)
                except:
                    print locals()
                    raise
                response = merged_response
                response.status = 200
                response.fromcache = True
            elif response.status == 200:
                content = new_content
            else:
                self.cache.delete(cachekey)
                content = new_content
        else:
            (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
        return (response, content)
class Response(dict):
    """An object more like rfc822.Message than httplib.HTTPResponse."""

    """Is this response from our local cache"""
    fromcache = False

    """HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
    version = 11

    """Status code returned by server."""
    status = 200

    """Reason phrase returned by server."""
    reason = "Ok"

    previous = None

    def __init__(self, info):
        # info is either an rfc822.Message or
        # an httplib.HTTPResponse object.
        if isinstance(info, httplib.HTTPResponse):
            for key, value in info.getheaders():
                self[key] = value
            self.status = info.status
            self['status'] = str(self.status)
            self.reason = info.reason
            self.version = info.version
        elif isinstance(info, rfc822.Message):
            for key, value in info.items():
                self[key] = value
            self.status = int(self['status'])

    def __getattr__(self, name):
        if name == 'dict':
            return self
        else:
            raise AttributeError, name
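
Taken together, typical use of this vendored module looks like the following (the URI and cache directory are hypothetical; the cache directory is created on demand):

    import httplib2
    h = httplib2.Http('/tmp/.cache')
    resp, content = h.request('http://example.org/feed.atom')
    print resp.status, resp.fromcache
    resp, content = h.request('http://example.org/feed.atom')
    print resp.status, resp.fromcache   # True only if the cached copy is still fresh or revalidates via ETag/Last-Modified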


@@ -4,10 +4,11 @@ and write each as a set of entries in a cache directory.
"""

# Standard library modules
import time, calendar, re, os
import time, calendar, re, os, urlparse
from xml.dom import minidom

# Planet modules
import planet, config, feedparser, reconstitute, shell
import planet, config, feedparser, reconstitute, shell, socket

from StringIO import StringIO

# Regular expressions to sanitise cache filenames
re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?')
@@ -116,8 +117,11 @@ def scrub(feed, data):
        source.author_detail.has_key('name'):
        source.author_detail['name'] = \
            str(stripHtml(source.author_detail.name))

def _is_http_uri(uri):
    parsed = urlparse.urlparse(uri)
    return parsed[0] in ['http', 'https']

def spiderFeed(feed, only_if_new=0):
def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
    """ Spider (fetch) a single feed """
    log = planet.logger
@@ -125,6 +129,7 @@ def spiderFeed(feed, only_if_new=0):
    sources = config.cache_sources_directory()
    if not os.path.exists(sources):
        os.makedirs(sources, 0700)

    feed_source = filename(sources, feed)
    feed_info = feedparser.parse(feed_source)
    if feed_info.feed and only_if_new:
@@ -135,6 +140,17 @@ def spiderFeed(feed, only_if_new=0):
        return

    # read feed itself
    if content:
        # httplib2 was used to get the content, so prepare a
        # proper object to pass to feedparser.
        f = StringIO(content)
        setattr(f, 'url', resp_headers.get('-location', feed))
        if resp_headers:
            if resp_headers.has_key('content-encoding'):
                del resp_headers['content-encoding']
            setattr(f, 'headers', resp_headers)
        data = feedparser.parse(f)
    else:
        modified = None
        try:
            modified=time.strptime(
@@ -326,12 +342,99 @@ def spiderFeed(feed, only_if_new=0):
def spiderPlanet(only_if_new = False):
    """ Spider (fetch) an entire planet """
    log = planet.getLogger(config.log_level(),config.log_format())
    planet.setTimeout(config.feed_timeout())

    global index
    index = True

    for feed in config.subscriptions():
    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
    except:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    if int(config.spider_threads()):
        from Queue import Queue, Empty
        from threading import Thread
        import httplib2
        from socket import gaierror, error

        work_queue = Queue()
        awaiting_parsing = Queue()

        http_cache = config.http_cache_directory()
        if not os.path.exists(http_cache):
            os.makedirs(http_cache, 0700)

        def _spider_proc(thread_index):
            h = httplib2.Http(http_cache)
            try:
                while True:
                    # The non-blocking get will throw an exception when the queue
                    # is empty which will terminate the thread.
                    uri = work_queue.get(block=False)
                    log.info("Fetching %s via %d", uri, thread_index)
                    try:
                        (resp, content) = h.request(uri)
                        awaiting_parsing.put(block=True, item=(resp, content, uri))
                    except gaierror:
                        log.error("Failed to resolve server name %s via %d", uri, thread_index)
                    except error, e:
                        log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
                    except Exception, e:
                        import sys, traceback
                        type, value, tb = sys.exc_info()
                        log.error('Error processing %s', uri)
                        for line in (traceback.format_exception_only(type, value) +
                            traceback.format_tb(tb)):
                            log.error(line.rstrip())
            except Empty, e:
                log.info("Thread %d finished", thread_index)

        # Load the work_queue with all the HTTP(S) uris.
        map(work_queue.put, [uri for uri in config.subscriptions() if _is_http_uri(uri)])

        # Start all the worker threads
        threads = dict([(i, Thread(target=_spider_proc, args=(i,))) for i in range(int(config.spider_threads()))])
        for t in threads.itervalues():
            t.start()

        # Process the results as they arrive
        while work_queue.qsize() or awaiting_parsing.qsize() or threads:
            if awaiting_parsing.qsize() == 0 and threads:
                time.sleep(1)
            while awaiting_parsing.qsize():
                item = awaiting_parsing.get(False)
                try:
                    (resp_headers, content, uri) = item
                    if not resp_headers.fromcache:
                        if resp_headers.status < 300:
                            log.info("Parsing pre-fetched %s", uri)
                            spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers)
                        else:
                            log.error("Status code %d from %s", resp_headers.status, uri)
                except Exception, e:
                    import sys, traceback
                    type, value, tb = sys.exc_info()
                    log.error('Error processing %s', uri)
                    for line in (traceback.format_exception_only(type, value) +
                        traceback.format_tb(tb)):
                        log.error(line.rstrip())
            for index in threads.keys():
                if not threads[index].isAlive():
                    del threads[index]

        log.info("Finished threaded part of processing.")

    # Process non-HTTP uris if we are threading, otherwise process *all* uris here.
    unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
    for feed in unthreaded_work_queue:
        try:
            spiderFeed(feed, only_if_new=only_if_new)
        except Exception,e:
@@ -341,3 +444,6 @@ def spiderPlanet(only_if_new = False):
            for line in (traceback.format_exception_only(type, value) +
                traceback.format_tb(tb)):
                log.error(line.rstrip())
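
The loop above pre-fills a Queue, lets each worker drain it with non-blocking gets, and treats the Empty exception as the termination signal, so no poison pills are needed. A standalone sketch of the same pattern (fetch_all is a hypothetical helper, not part of this commit):

    from Queue import Queue, Empty
    from threading import Thread
    import httplib2

    def fetch_all(uris, nthreads=2):
        work, results = Queue(), Queue()
        map(work.put, uris)                    # pre-fill before any worker starts
        def worker():
            h = httplib2.Http()                # one connection pool per thread
            try:
                while True:
                    uri = work.get(block=False)    # raises Empty once the queue is drained
                    try:
                        resp, content = h.request(uri)
                        results.put((uri, resp, content))
                    except Exception:
                        pass                       # the real code above logs these failures
            except Empty:
                pass
        threads = [Thread(target=worker) for i in range(nthreads)]
        for t in threads: t.start()
        for t in threads: t.join()
        return [results.get(False) for i in range(results.qsize())]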


@@ -0,0 +1,19 @@
[Planet]
name = test planet
cache_directory = tests/work/spider/cache
spider_threads = 2
# for testing purposes, must equal port number below
test_port = 8098
[http://127.0.0.1:8098/tests/data/spider/testfeed0.atom]
name = not found
[http://127.0.0.1:8098/tests/data/spider/testfeed1b.atom]
name = one
[http://127.0.0.1:8098/tests/data/spider/testfeed2.atom]
name = two
[http://127.0.0.1:8098/tests/data/spider/testfeed3.rss]
name = three


@@ -1,6 +1,6 @@
#!/usr/bin/env python
import unittest, os, glob, calendar, shutil
import unittest, os, glob, calendar, shutil, time
from planet.spider import filename, spiderFeed, spiderPlanet
from planet import feedparser, config
import planet
@@ -43,9 +43,7 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
            filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))

    def test_spiderFeed(self):
        config.load(configfile)
        spiderFeed(testfeed % '1b')
    def verify_spiderFeed(self):
        files = glob.glob(workdir+"/*")
        files.sort()
@@ -64,13 +62,18 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(os.stat(files[2]).st_mtime,
            calendar.timegm(data.entries[0].updated_parsed))

    def test_spiderUpdate(self):
        spiderFeed(testfeed % '1a')
        self.test_spiderFeed()

    def test_spiderPlanet(self):
    def test_spiderFeed(self):
        config.load(configfile)
        spiderPlanet()
        spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def test_spiderUpdate(self):
        config.load(configfile)
        spiderFeed(testfeed % '1a')
        spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def verify_spiderPlanet(self):
        files = glob.glob(workdir+"/*")

        # verify that exactly eight files + 1 source dir were produced
@@ -88,3 +91,48 @@ class SpiderTest(unittest.TestCase):
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('three', data.entries[0].source.author_detail.name)

    def test_spiderPlanet(self):
        config.load(configfile)
        spiderPlanet()
        self.verify_spiderPlanet()

    def test_spiderThreads(self):
        config.load(configfile.replace('config','threaded'))
        _PORT = config.parser.getint('Planet','test_port')

        log = []
        from SimpleHTTPServer import SimpleHTTPRequestHandler
        class TestRequestHandler(SimpleHTTPRequestHandler):
            def log_message(self, format, *args):
                log.append(args)

        from threading import Thread
        class TestServerThread(Thread):
            def __init__(self):
                self.ready = 0
                self.done = 0
                Thread.__init__(self)
            def run(self):
                from BaseHTTPServer import HTTPServer
                httpd = HTTPServer(('',_PORT), TestRequestHandler)
                self.ready = 1
                while not self.done:
                    httpd.handle_request()

        httpd = TestServerThread()
        httpd.start()
        while not httpd.ready:
            time.sleep(0.1)

        try:
            spiderPlanet()
        finally:
            httpd.done = 1
            import urllib
            urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()

        status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
        status.sort()
        self.assertEqual([200,200,200,200,404], status)

        self.verify_spiderPlanet()