Spider threads

commit 6ebbed2ab7

THANKS  (2 changed lines)
@@ -4,7 +4,7 @@ Elias Torres - FOAF OnlineAccounts
Jacques Distler - Template patches
Michael Koziarski - HTTP Auth fix
Brian Ewins - Win32 / Portalocker
Joe Gregorio - Invoke same version of Python for filters
Joe Gregorio - python versioning for filters, verbose tests, spider_threads
Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
@@ -98,6 +98,9 @@ use for logging output. Note: this configuration value is processed
<dd>Number of seconds to wait for any given feed</dd>
<dt><del>new_feed_items</del></dt>
<dd>Number of items to take from new feeds</dd>
<dt><ins>spider_threads</ins></dt>
<dd>The number of threads to use when spidering. When set to 0, the default,
no threads are used and spidering follows the traditional algorithm.</dd>
</dl>
</blockquote>
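For reference, the threaded test configuration added later in this commit (tests/data/spider/threaded.ini) exercises the new option by enabling two fetcher threads:

    [Planet]
    name = test planet
    cache_directory = tests/work/spider/cache
    spider_threads = 2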
@@ -54,7 +54,10 @@ if __name__ == "__main__":

    if not offline:
        from planet import spider
        try:
            spider.spiderPlanet(only_if_new=only_if_new)
        except Exception, e:
            print e

    from planet import splice
    doc = splice.splice()
@@ -31,25 +31,4 @@ def getLogger(level, format):
    return logger


def setTimeout(timeout):
    """ time out rather than hang forever on ultra-slow servers."""
    if timeout:
        try:
            timeout = float(timeout)
        except:
            logger.warning("Timeout set to invalid value '%s', skipping", timeout)
            timeout = None

    if timeout:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(timeout)
            logger.info("Socket timeout set to %d seconds", timeout)
        except ImportError:
            import socket
            if hasattr(socket, 'setdefaulttimeout'):
                logger.debug("timeoutsocket not found, using python function")
                socket.setdefaulttimeout(timeout)
                logger.info("Socket timeout set to %d seconds", timeout)
            else:
                logger.error("Unable to set timeout to %d seconds", timeout)
@@ -100,6 +100,7 @@ def __init__():
    define_planet('owner_email', '')
    define_planet('output_theme', '')
    define_planet('output_dir', 'output')
    define_planet('spider_threads', 0)

    define_planet_list('template_files')
    define_planet_list('bill_of_materials')
@@ -282,6 +283,11 @@ def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=Tru
    except:
        logger.exception("Unable to read %s readinglist", list)

def http_cache_directory():
    if parser.has_option('Planet', 'http_cache_directory'):
        return parser.get('Planet', 'http_cache_directory')
    else:
        return os.path.join(cache_directory(), 'sources/http')

def cache_sources_directory():
    if parser.has_option('Planet', 'cache_sources_directory'):
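The new spider_threads and http_cache_directory settings are consumed by the threaded spider later in this commit. A minimal sketch of that wiring, assuming a loaded configuration (it mirrors the spider.py changes below; the variable names are only illustrative):

    import os
    import httplib2
    from planet import config

    if int(config.spider_threads()):              # 0 keeps the old single-threaded path
        http_cache = config.http_cache_directory()
        if not os.path.exists(http_cache):
            os.makedirs(http_cache, 0700)         # Python 2 octal mode, as in the commit
        h = httplib2.Http(http_cache)             # cache-backed client used by each worker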
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.2-pre-" + "$Revision: 1.144 $"[11:16] + "-cvs"
__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
planet/httplib2/__init__.py  (new file, 842 lines)
@@ -0,0 +1,842 @@
|
||||
"""
|
||||
httplib2
|
||||
|
||||
A caching http interface that supports ETags and gzip
|
||||
to conserve bandwidth.
|
||||
|
||||
Requires Python 2.3 or later
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import generators
|
||||
|
||||
__author__ = "Joe Gregorio (joe@bitworking.org)"
|
||||
__copyright__ = "Copyright 2006, Joe Gregorio"
|
||||
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
|
||||
"James Antill",
|
||||
"Xavier Verges Farrero",
|
||||
"Jonathan Feinberg",
|
||||
"Blair Zajac"]
|
||||
__license__ = "MIT"
|
||||
__version__ = "$Rev: 209 $"
|
||||
|
||||
import re
|
||||
import md5
|
||||
import rfc822
|
||||
import StringIO
|
||||
import gzip
|
||||
import zlib
|
||||
import httplib
|
||||
import urlparse
|
||||
import base64
|
||||
import os
|
||||
import copy
|
||||
import calendar
|
||||
import time
|
||||
import random
|
||||
import sha
|
||||
import hmac
|
||||
from gettext import gettext as _
|
||||
from socket import gaierror
|
||||
|
||||
__all__ = ['Http', 'Response', 'HttpLib2Error',
|
||||
'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent',
|
||||
'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError',
|
||||
'debuglevel']
|
||||
|
||||
|
||||
# The httplib debug level, set to a non-zero value to get debug output
|
||||
debuglevel = 0
|
||||
|
||||
# Python 2.3 support
|
||||
if 'sorted' not in __builtins__:
|
||||
def sorted(seq):
|
||||
seq.sort()
|
||||
return seq
|
||||
|
||||
# Python 2.3 support
|
||||
def HTTPResponse__getheaders(self):
|
||||
"""Return list of (header, value) tuples."""
|
||||
if self.msg is None:
|
||||
print "================================"
|
||||
raise httplib.ResponseNotReady()
|
||||
return self.msg.items()
|
||||
|
||||
if not hasattr(httplib.HTTPResponse, 'getheaders'):
|
||||
httplib.HTTPResponse.getheaders = HTTPResponse__getheaders
|
||||
|
||||
# All exceptions raised here derive from HttpLib2Error
|
||||
class HttpLib2Error(Exception): pass
|
||||
|
||||
class RedirectMissingLocation(HttpLib2Error): pass
|
||||
class RedirectLimit(HttpLib2Error): pass
|
||||
class FailedToDecompressContent(HttpLib2Error): pass
|
||||
class UnimplementedDigestAuthOptionError(HttpLib2Error): pass
|
||||
class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass
|
||||
|
||||
# Open Items:
|
||||
# -----------
|
||||
# Proxy support
|
||||
|
||||
# Are we removing the cached content too soon on PUT (only delete on 200 Maybe?)
|
||||
|
||||
# Pluggable cache storage (supports storing the cache in
|
||||
# flat files by default. We need a plug-in architecture
|
||||
# that can support Berkeley DB and Squid)
|
||||
|
||||
# == Known Issues ==
|
||||
# Does not handle a resource that uses conneg and Last-Modified but no ETag as a cache validator.
|
||||
# Does not handle Cache-Control: max-stale
|
||||
# Does not use Age: headers when calculating cache freshness.
|
||||
|
||||
|
||||
# The number of redirections to follow before giving up.
|
||||
# Note that only GET redirects are automatically followed.
|
||||
# Will also honor 301 requests by saving that info and never
|
||||
# requesting that URI again.
|
||||
DEFAULT_MAX_REDIRECTS = 5
|
||||
|
||||
# Which headers are hop-by-hop headers by default
|
||||
HOP_BY_HOP = ['connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'te', 'trailers', 'transfer-encoding', 'upgrade']
|
||||
|
||||
def _get_end2end_headers(response):
|
||||
hopbyhop = list(HOP_BY_HOP)
|
||||
hopbyhop.extend([x.strip() for x in response.get('connection', '').split(',')])
|
||||
return [header for header in response.keys() if header not in hopbyhop]
|
||||
|
||||
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
|
||||
|
||||
def parse_uri(uri):
|
||||
"""Parses a URI using the regex given in Appendix B of RFC 3986.
|
||||
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
"""
|
||||
groups = URI.match(uri).groups()
|
||||
return (groups[1], groups[3], groups[4], groups[6], groups[8])
|
||||
|
||||
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
|
||||
def _normalize_headers(headers):
|
||||
return dict([ (key.lower(), NORMALIZE_SPACE.sub(value, ' ').strip()) for (key, value) in headers.iteritems()])
|
||||
|
||||
def _parse_cache_control(headers):
|
||||
retval = {}
|
||||
if headers.has_key('cache-control'):
|
||||
parts = headers['cache-control'].split(',')
|
||||
parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")]
|
||||
parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")]
|
||||
retval = dict(parts_with_args + parts_wo_args)
|
||||
return retval
|
||||
|
||||
# Whether to use a strict mode to parse WWW-Authenticate headers
|
||||
# Might lead to bad results in case of ill-formed header value,
|
||||
# so disabled by default, falling back to relaxed parsing.
|
||||
# Set to true to turn on, usefull for testing servers.
|
||||
USE_WWW_AUTH_STRICT_PARSING = 0
|
||||
|
||||
# In regex below:
|
||||
# [^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+ matches a "token" as defined by HTTP
|
||||
# "(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?" matches a "quoted-string" as defined by HTTP, when LWS have already been replaced by a single space
|
||||
# Actually, as an auth-param value can be either a token or a quoted-string, they are combined in a single pattern which matches both:
|
||||
# \"?((?<=\")(?:[^\0-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x08\x0A-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?
|
||||
WWW_AUTH_STRICT = re.compile(r"^(?:\s*(?:,\s*)?([^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+)\s*=\s*\"?((?<=\")(?:[^\0-\x08\x0A-\x1f\x7f-\xff\\\"]|\\[\0-\x7f])*?(?=\")|(?<!\")[^\0-\x1f\x7f-\xff()<>@,;:\\\"/[\]?={} \t]+(?!\"))\"?)(.*)$")
|
||||
WWW_AUTH_RELAXED = re.compile(r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$")
|
||||
UNQUOTE_PAIRS = re.compile(r'\\(.)')
|
||||
def _parse_www_authenticate(headers, headername='www-authenticate'):
|
||||
"""Returns a dictionary of dictionaries, one dict
|
||||
per auth_scheme."""
|
||||
retval = {}
|
||||
if headers.has_key(headername):
|
||||
authenticate = headers[headername].strip()
|
||||
www_auth = USE_WWW_AUTH_STRICT_PARSING and WWW_AUTH_STRICT or WWW_AUTH_RELAXED
|
||||
while authenticate:
|
||||
# Break off the scheme at the beginning of the line
|
||||
if headername == 'authentication-info':
|
||||
(auth_scheme, the_rest) = ('digest', authenticate)
|
||||
else:
|
||||
(auth_scheme, the_rest) = authenticate.split(" ", 1)
|
||||
# Now loop over all the key value pairs that come after the scheme,
|
||||
# being careful not to roll into the next scheme
|
||||
match = www_auth.search(the_rest)
|
||||
auth_params = {}
|
||||
while match:
|
||||
if match and len(match.groups()) == 3:
|
||||
(key, value, the_rest) = match.groups()
|
||||
auth_params[key.lower()] = UNQUOTE_PAIRS.sub(r'\1', value) # '\\'.join([x.replace('\\', '') for x in value.split('\\\\')])
|
||||
match = www_auth.search(the_rest)
|
||||
retval[auth_scheme.lower()] = auth_params
|
||||
authenticate = the_rest.strip()
|
||||
return retval
|
||||
|
||||
|
||||
def _entry_disposition(response_headers, request_headers):
|
||||
"""Determine freshness from the Date, Expires and Cache-Control headers.
|
||||
|
||||
We don't handle the following:
|
||||
|
||||
1. Cache-Control: max-stale
|
||||
2. Age: headers are not used in the calculations.
|
||||
|
||||
Not that this algorithm is simpler than you might think
|
||||
because we are operating as a private (non-shared) cache.
|
||||
This lets us ignore 's-maxage'. We can also ignore
|
||||
'proxy-invalidate' since we aren't a proxy.
|
||||
We will never return a stale document as
|
||||
fresh as a design decision, and thus the non-implementation
|
||||
of 'max-stale'. This also lets us safely ignore 'must-revalidate'
|
||||
since we operate as if every server has sent 'must-revalidate'.
|
||||
Since we are private we get to ignore both 'public' and
|
||||
'private' parameters. We also ignore 'no-transform' since
|
||||
we don't do any transformations.
|
||||
The 'no-store' parameter is handled at a higher level.
|
||||
So the only Cache-Control parameters we look at are:
|
||||
|
||||
no-cache
|
||||
only-if-cached
|
||||
max-age
|
||||
min-fresh
|
||||
"""
|
||||
|
||||
retval = "STALE"
|
||||
cc = _parse_cache_control(request_headers)
|
||||
cc_response = _parse_cache_control(response_headers)
|
||||
|
||||
if request_headers.has_key('pragma') and request_headers['pragma'].lower().find('no-cache') != -1:
|
||||
retval = "TRANSPARENT"
|
||||
if 'cache-control' not in request_headers:
|
||||
request_headers['cache-control'] = 'no-cache'
|
||||
elif cc.has_key('no-cache'):
|
||||
retval = "TRANSPARENT"
|
||||
elif cc_response.has_key('no-cache'):
|
||||
retval = "STALE"
|
||||
elif cc.has_key('only-if-cached'):
|
||||
retval = "FRESH"
|
||||
elif response_headers.has_key('date'):
|
||||
date = calendar.timegm(rfc822.parsedate_tz(response_headers['date']))
|
||||
now = time.time()
|
||||
current_age = max(0, now - date)
|
||||
if cc_response.has_key('max-age'):
|
||||
freshness_lifetime = int(cc_response['max-age'])
|
||||
elif response_headers.has_key('expires'):
|
||||
expires = rfc822.parsedate_tz(response_headers['expires'])
|
||||
freshness_lifetime = max(0, calendar.timegm(expires) - date)
|
||||
else:
|
||||
freshness_lifetime = 0
|
||||
if cc.has_key('max-age'):
|
||||
freshness_lifetime = min(freshness_lifetime, int(cc['max-age']))
|
||||
if cc.has_key('min-fresh'):
|
||||
current_age += int(cc['min-fresh'])
|
||||
if freshness_lifetime > current_age:
|
||||
retval = "FRESH"
|
||||
return retval
|
||||
|
||||
def _decompressContent(response, new_content):
|
||||
content = new_content
|
||||
try:
|
||||
if response.get('content-encoding', None) == 'gzip':
|
||||
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
|
||||
response['content-length'] = str(len(content))
|
||||
if response.get('content-encoding', None) == 'deflate':
|
||||
content = zlib.decompress(content)
|
||||
response['content-length'] = str(len(content))
|
||||
except:
|
||||
content = ""
|
||||
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'))
|
||||
return content
|
||||
|
||||
def _updateCache(request_headers, response_headers, content, cache, cachekey):
|
||||
if cachekey:
|
||||
cc = _parse_cache_control(request_headers)
|
||||
cc_response = _parse_cache_control(response_headers)
|
||||
if cc.has_key('no-store') or cc_response.has_key('no-store'):
|
||||
cache.delete(cachekey)
|
||||
else:
|
||||
f = StringIO.StringIO("")
|
||||
info = rfc822.Message(StringIO.StringIO(""))
|
||||
for key, value in response_headers.iteritems():
|
||||
info[key] = value
|
||||
f.write(str(info))
|
||||
f.write("\r\n\r\n")
|
||||
f.write(content)
|
||||
cache.set(cachekey, f.getvalue())
|
||||
|
||||
def _cnonce():
|
||||
dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
|
||||
return dig[:16]
|
||||
|
||||
def _wsse_username_token(cnonce, iso_now, password):
|
||||
return base64.encodestring(sha.new("%s%s%s" % (cnonce, iso_now, password)).digest()).strip()
|
||||
|
||||
|
||||
# For credentials we need two things, first
|
||||
# a pool of credential to try (not necesarily tied to BAsic, Digest, etc.)
|
||||
# Then we also need a list of URIs that have already demanded authentication
|
||||
# That list is tricky since sub-URIs can take the same auth, or the
|
||||
# auth scheme may change as you descend the tree.
|
||||
# So we also need each Auth instance to be able to tell us
|
||||
# how close to the 'top' it is.
|
||||
|
||||
class Authentication:
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
|
||||
self.path = path
|
||||
self.host = host
|
||||
self.credentials = credentials
|
||||
self.http = http
|
||||
|
||||
def depth(self, request_uri):
|
||||
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
|
||||
return request_uri[len(self.path):].count("/")
|
||||
|
||||
def inscope(self, host, request_uri):
|
||||
# XXX Should we normalize the request_uri?
|
||||
(scheme, authority, path, query, fragment) = parse_uri(request_uri)
|
||||
return (host == self.host) and path.startswith(self.path)
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header. Over-rise this in sub-classes."""
|
||||
pass
|
||||
|
||||
def response(self, response, content):
|
||||
"""Gives us a chance to update with new nonces
|
||||
or such returned from the last authorized response.
|
||||
Over-rise this in sub-classes if necessary.
|
||||
|
||||
Return TRUE is the request is to be retried, for
|
||||
example Digest may return stale=true.
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
|
||||
class BasicAuthentication(Authentication):
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header."""
|
||||
headers['authorization'] = 'Basic ' + base64.encodestring("%s:%s" % self.credentials).strip()
|
||||
|
||||
|
||||
class DigestAuthentication(Authentication):
|
||||
"""Only do qop='auth' and MD5, since that
|
||||
is all Apache currently implements"""
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate')
|
||||
self.challenge = challenge['digest']
|
||||
qop = self.challenge.get('qop')
|
||||
self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
|
||||
if self.challenge['qop'] is None:
|
||||
raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
|
||||
self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5')
|
||||
if self.challenge['algorithm'] != 'MD5':
|
||||
raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
|
||||
self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
|
||||
self.challenge['nc'] = 1
|
||||
|
||||
def request(self, method, request_uri, headers, content, cnonce = None):
|
||||
"""Modify the request headers"""
|
||||
H = lambda x: md5.new(x).hexdigest()
|
||||
KD = lambda s, d: H("%s:%s" % (s, d))
|
||||
A2 = "".join([method, ":", request_uri])
|
||||
self.challenge['cnonce'] = cnonce or _cnonce()
|
||||
request_digest = '"%s"' % KD(H(self.A1), "%s:%s:%s:%s:%s" % (self.challenge['nonce'],
|
||||
'%08x' % self.challenge['nc'],
|
||||
self.challenge['cnonce'],
|
||||
self.challenge['qop'], H(A2)
|
||||
))
|
||||
headers['Authorization'] = 'Digest username="%s", realm="%s", nonce="%s", uri="%s", algorithm=%s, response=%s, qop=%s, nc=%08x, cnonce="%s"' % (
|
||||
self.credentials[0],
|
||||
self.challenge['realm'],
|
||||
self.challenge['nonce'],
|
||||
request_uri,
|
||||
self.challenge['algorithm'],
|
||||
request_digest,
|
||||
self.challenge['qop'],
|
||||
self.challenge['nc'],
|
||||
self.challenge['cnonce'],
|
||||
)
|
||||
self.challenge['nc'] += 1
|
||||
|
||||
def response(self, response, content):
|
||||
if not response.has_key('authentication-info'):
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate')['digest']
|
||||
if 'true' == challenge.get('stale'):
|
||||
self.challenge['nonce'] = challenge['nonce']
|
||||
self.challenge['nc'] = 1
|
||||
return True
|
||||
else:
|
||||
updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest']
|
||||
|
||||
if updated_challenge.has_key('nextnonce'):
|
||||
self.challenge['nonce'] = updated_challenge['nextnonce']
|
||||
self.challenge['nc'] = 1
|
||||
return False
|
||||
|
||||
|
||||
class HmacDigestAuthentication(Authentication):
|
||||
"""Adapted from Robert Sayre's code and DigestAuthentication above."""
|
||||
__author__ = "Thomas Broyer (t.broyer@ltgt.net)"
|
||||
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate')
|
||||
self.challenge = challenge['hmacdigest']
|
||||
print self.challenge
|
||||
# TODO: self.challenge['domain']
|
||||
self.challenge['reason'] = self.challenge.get('reason', 'unauthorized')
|
||||
if self.challenge['reason'] not in ['unauthorized', 'integrity']:
|
||||
self.challenge['reason'] = 'unauthorized'
|
||||
self.challenge['salt'] = self.challenge.get('salt', '')
|
||||
if not self.challenge.get('snonce'):
|
||||
raise UnimplementedHmacDigestAuthOptionError( _("The challenge doesn't contain a server nonce, or this one is empty."))
|
||||
self.challenge['algorithm'] = self.challenge.get('algorithm', 'HMAC-SHA-1')
|
||||
if self.challenge['algorithm'] not in ['HMAC-SHA-1', 'HMAC-MD5']:
|
||||
raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
|
||||
self.challenge['pw-algorithm'] = self.challenge.get('pw-algorithm', 'SHA-1')
|
||||
if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
|
||||
raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s." % self.challenge['pw-algorithm']))
|
||||
if self.challenge['algorithm'] == 'HMAC-MD5':
|
||||
self.hashmod = md5
|
||||
else:
|
||||
self.hashmod = sha
|
||||
if self.challenge['pw-algorithm'] == 'MD5':
|
||||
self.pwhashmod = md5
|
||||
else:
|
||||
self.pwhashmod = sha
|
||||
self.key = "".join([self.credentials[0], ":",
|
||||
self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
|
||||
":", self.challenge['realm']
|
||||
])
|
||||
print response['www-authenticate']
|
||||
print "".join([self.credentials[1], self.challenge['salt']])
|
||||
print "key_str = %s" % self.key
|
||||
self.key = self.pwhashmod.new(self.key).hexdigest().lower()
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers"""
|
||||
keys = _get_end2end_headers(headers)
|
||||
keylist = "".join(["%s " % k for k in keys])
|
||||
headers_val = "".join([headers[k] for k in keys])
|
||||
created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime())
|
||||
cnonce = _cnonce()
|
||||
request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val)
|
||||
print "key = %s" % self.key
|
||||
print "msg = %s" % request_digest
|
||||
request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower()
|
||||
headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % (
|
||||
self.credentials[0],
|
||||
self.challenge['realm'],
|
||||
self.challenge['snonce'],
|
||||
cnonce,
|
||||
request_uri,
|
||||
created,
|
||||
request_digest,
|
||||
keylist,
|
||||
)
|
||||
|
||||
def response(self, response, content):
|
||||
challenge = _parse_www_authenticate(response, 'www-authenticate').get('hmacdigest', {})
|
||||
if challenge.get('reason') in ['integrity', 'stale']:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class WsseAuthentication(Authentication):
|
||||
"""This is thinly tested and should not be relied upon.
|
||||
At this time there isn't any third party server to test against.
|
||||
Blogger and TypePad implemented this algorithm at one point
|
||||
but Blogger has since switched to Basic over HTTPS and
|
||||
TypePad has implemented it wrong, by never issuing a 401
|
||||
challenge but instead requiring your client to telepathically know that
|
||||
their endpoint is expecting WSSE profile="UsernameToken"."""
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header."""
|
||||
headers['Authorization'] = 'WSSE profile="UsernameToken"'
|
||||
iso_now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
|
||||
cnonce = _cnonce()
|
||||
password_digest = _wsse_username_token(cnonce, iso_now, self.credentials[1])
|
||||
headers['X-WSSE'] = 'UsernameToken Username="%s", PasswordDigest="%s", Nonce="%s", Created="%s"' % (
|
||||
self.credentials[0],
|
||||
password_digest,
|
||||
cnonce,
|
||||
iso_now)
|
||||
|
||||
class GoogleLoginAuthentication(Authentication):
|
||||
def __init__(self, credentials, host, request_uri, headers, response, content, http):
|
||||
from urllib import urlencode
|
||||
Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
|
||||
|
||||
auth = dict(Email=credentials[0], Passwd=credentials[1], service='cl', source=headers['user-agent'])
|
||||
resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
|
||||
lines = content.split('\n')
|
||||
d = dict([tuple(line.split("=", 1)) for line in lines if line])
|
||||
if resp.status == 403:
|
||||
self.Auth = ""
|
||||
else:
|
||||
self.Auth = d['Auth']
|
||||
|
||||
def request(self, method, request_uri, headers, content):
|
||||
"""Modify the request headers to add the appropriate
|
||||
Authorization header."""
|
||||
headers['authorization'] = 'GoogleLogin Auth=' + self.Auth
|
||||
|
||||
|
||||
AUTH_SCHEME_CLASSES = {
|
||||
"basic": BasicAuthentication,
|
||||
"wsse": WsseAuthentication,
|
||||
"digest": DigestAuthentication,
|
||||
"hmacdigest": HmacDigestAuthentication,
|
||||
"googlelogin": GoogleLoginAuthentication
|
||||
}
|
||||
|
||||
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
|
||||
|
||||
|
||||
class FileCache:
|
||||
"""Uses a local directory as a store for cached files.
|
||||
Not really safe to use if multiple threads or processes are going to
|
||||
be running on the same cache.
|
||||
"""
|
||||
def __init__(self, cache):
|
||||
self.cache = cache
|
||||
if not os.path.exists(cache):
|
||||
os.makedirs(self.cache)
|
||||
|
||||
def get(self, key):
|
||||
retval = None
|
||||
cacheFullPath = os.path.join(self.cache, key)
|
||||
try:
|
||||
f = file(cacheFullPath, "r")
|
||||
retval = f.read()
|
||||
f.close()
|
||||
except:
|
||||
pass
|
||||
return retval
|
||||
|
||||
def set(self, key, value):
|
||||
cacheFullPath = os.path.join(self.cache, key)
|
||||
f = file(cacheFullPath, "w")
|
||||
f.write(value)
|
||||
f.close()
|
||||
|
||||
def delete(self, key):
|
||||
cacheFullPath = os.path.join(self.cache, key)
|
||||
if os.path.exists(cacheFullPath):
|
||||
os.remove(cacheFullPath)
|
||||
|
||||
class Http:
|
||||
"""An HTTP client that handles all
|
||||
methods, caching, ETags, compression,
|
||||
HTTPS, Basic, Digest, WSSE, etc.
|
||||
"""
|
||||
def __init__(self, cache=None):
|
||||
# Map domain name to an httplib connection
|
||||
self.connections = {}
|
||||
# The location of the cache, for now a directory
|
||||
# where cached responses are held.
|
||||
if cache and isinstance(cache, str):
|
||||
self.cache = FileCache(cache)
|
||||
else:
|
||||
self.cache = cache
|
||||
|
||||
# tuples of name, password
|
||||
self.credentials = []
|
||||
|
||||
# authorization objects
|
||||
self.authorizations = []
|
||||
|
||||
self.follow_all_redirects = False
|
||||
|
||||
self.ignore_etag = False
|
||||
|
||||
def _auth_from_challenge(self, host, request_uri, headers, response, content):
|
||||
"""A generator that creates Authorization objects
|
||||
that can be applied to requests.
|
||||
"""
|
||||
challenges = _parse_www_authenticate(response, 'www-authenticate')
|
||||
for cred in self.credentials:
|
||||
for scheme in AUTH_SCHEME_ORDER:
|
||||
if challenges.has_key(scheme):
|
||||
yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
|
||||
|
||||
def add_credentials(self, name, password):
|
||||
"""Add a name and password that will be used
|
||||
any time a request requires authentication."""
|
||||
self.credentials.append((name, password))
|
||||
|
||||
def clear_credentials(self):
|
||||
"""Remove all the names and passwords
|
||||
that are used for authentication"""
|
||||
self.credentials = []
|
||||
self.authorizations = []
|
||||
|
||||
def _conn_request(self, conn, request_uri, method, body, headers):
|
||||
for i in range(2):
|
||||
try:
|
||||
conn.request(method, request_uri, body, headers)
|
||||
response = conn.getresponse()
|
||||
except:
|
||||
if i == 0:
|
||||
conn.close()
|
||||
conn.connect()
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
content = response.read()
|
||||
response = Response(response)
|
||||
content = _decompressContent(response, content)
|
||||
|
||||
break;
|
||||
return (response, content)
|
||||
|
||||
|
||||
def _request(self, conn, host, absolute_uri, request_uri, method, body, headers, redirections, cachekey):
|
||||
"""Do the actual request using the connection object
|
||||
and also follow one level of redirects if necessary"""
|
||||
|
||||
auths = [(auth.depth(request_uri), auth) for auth in self.authorizations if auth.inscope(host, request_uri)]
|
||||
auth = auths and sorted(auths)[0][1] or None
|
||||
if auth:
|
||||
auth.request(method, request_uri, headers, body)
|
||||
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
|
||||
|
||||
if auth:
|
||||
if auth.response(response, body):
|
||||
auth.request(method, request_uri, headers, body)
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers )
|
||||
response._stale_digest = 1
|
||||
|
||||
if response.status == 401:
|
||||
for authorization in self._auth_from_challenge(host, request_uri, headers, response, content):
|
||||
authorization.request(method, request_uri, headers, body)
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers, )
|
||||
if response.status != 401:
|
||||
self.authorizations.append(authorization)
|
||||
authorization.response(response, body)
|
||||
break
|
||||
|
||||
if (self.follow_all_redirects or method in ["GET", "HEAD"]) or response.status == 303:
|
||||
if response.status in [300, 301, 302, 303, 307]:
|
||||
# Pick out the location header and basically start from the beginning
|
||||
# remembering first to strip the ETag header and decrement our 'depth'
|
||||
if redirections:
|
||||
if not response.has_key('location') and response.status != 300:
|
||||
raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."))
|
||||
# Fix-up relative redirects (which violate an RFC 2616 MUST)
|
||||
if response.has_key('location'):
|
||||
location = response['location']
|
||||
(scheme, authority, path, query, fragment) = parse_uri(location)
|
||||
if authority == None:
|
||||
response['location'] = urlparse.urljoin(absolute_uri, location)
|
||||
if response.status == 301 and method in ["GET", "HEAD"]:
|
||||
response['-x-permanent-redirect-url'] = response['location']
|
||||
response['-location'] = absolute_uri
|
||||
_updateCache(headers, response, content, self.cache, cachekey)
|
||||
if headers.has_key('if-none-match'):
|
||||
del headers['if-none-match']
|
||||
if headers.has_key('if-modified-since'):
|
||||
del headers['if-modified-since']
|
||||
if response.has_key('location'):
|
||||
location = response['location']
|
||||
old_response = copy.deepcopy(response)
|
||||
old_response['-location'] = absolute_uri
|
||||
redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
|
||||
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
|
||||
response.previous = old_response
|
||||
else:
|
||||
raise RedirectLimit( _("Redirected more times than rediection_limit allows."))
|
||||
elif response.status in [200, 203] and method == "GET":
|
||||
# Don't cache 206's since we aren't going to handle byte range requests
|
||||
response['-location'] = absolute_uri
|
||||
_updateCache(headers, response, content, self.cache, cachekey)
|
||||
|
||||
return (response, content)
|
||||
|
||||
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS):
|
||||
""" Performs a single HTTP request.
|
||||
The 'uri' is the URI of the HTTP resource and can begin
|
||||
with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
|
||||
|
||||
The 'method' is the HTTP method to perform, such as GET, POST, DELETE, etc.
|
||||
There is no restriction on the methods allowed.
|
||||
|
||||
The 'body' is the entity body to be sent with the request. It is a string
|
||||
object.
|
||||
|
||||
Any extra headers that are to be sent with the request should be provided in the
|
||||
'headers' dictionary.
|
||||
|
||||
The maximum number of redirect to follow before raising an
|
||||
exception is 'redirections. The default is 5.
|
||||
|
||||
The return value is a tuple of (response, content), the first
|
||||
being and instance of the 'Response' class, the second being
|
||||
a string that contains the response entity body.
|
||||
"""
|
||||
if headers is None:
|
||||
headers = {}
|
||||
else:
|
||||
headers = _normalize_headers(headers)
|
||||
|
||||
if not headers.has_key('user-agent'):
|
||||
headers['user-agent'] = "Python-httplib2/%s" % __version__
|
||||
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
authority = authority.lower()
|
||||
if not path:
|
||||
path = "/"
|
||||
# Could do syntax based normalization of the URI before
|
||||
# computing the digest. See Section 6.2.2 of Std 66.
|
||||
request_uri = query and "?".join([path, query]) or path
|
||||
defrag_uri = scheme + "://" + authority + request_uri
|
||||
|
||||
if not self.connections.has_key(scheme+":"+authority):
|
||||
connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
|
||||
conn = self.connections[scheme+":"+authority] = connection_type(authority)
|
||||
conn.set_debuglevel(debuglevel)
|
||||
else:
|
||||
conn = self.connections[scheme+":"+authority]
|
||||
|
||||
if method in ["GET", "HEAD"] and 'range' not in headers:
|
||||
headers['accept-encoding'] = 'compress, gzip'
|
||||
|
||||
info = rfc822.Message(StringIO.StringIO(""))
|
||||
cached_value = None
|
||||
if self.cache:
|
||||
cachekey = md5.new(defrag_uri).hexdigest()
|
||||
cached_value = self.cache.get(cachekey)
|
||||
if cached_value:
|
||||
try:
|
||||
f = StringIO.StringIO(cached_value)
|
||||
info = rfc822.Message(f)
|
||||
content = cached_value.split('\r\n\r\n', 1)[1]
|
||||
except:
|
||||
self.cache.delete(cachekey)
|
||||
cachekey = None
|
||||
cached_value = None
|
||||
else:
|
||||
cachekey = None
|
||||
|
||||
if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag:
|
||||
# http://www.w3.org/1999/04/Editing/
|
||||
headers['if-match'] = info['etag']
|
||||
|
||||
if method not in ["GET", "HEAD"] and self.cache and cachekey:
|
||||
# RFC 2616 Section 13.10
|
||||
self.cache.delete(cachekey)
|
||||
|
||||
if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
|
||||
if info.has_key('-x-permanent-redirect-url'):
|
||||
# Should cached permanent redirects be counted in our redirection count? For now, yes.
|
||||
(response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
|
||||
response.previous = Response(info)
|
||||
response.previous.fromcache = True
|
||||
else:
|
||||
# Determine our course of action:
|
||||
# Is the cached entry fresh or stale?
|
||||
# Has the client requested a non-cached response?
|
||||
#
|
||||
# There seems to be three possible answers:
|
||||
# 1. [FRESH] Return the cache entry w/o doing a GET
|
||||
# 2. [STALE] Do the GET (but add in cache validators if available)
|
||||
# 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
|
||||
entry_disposition = _entry_disposition(info, headers)
|
||||
|
||||
if entry_disposition == "FRESH":
|
||||
if not cached_value:
|
||||
info['status'] = '504'
|
||||
content = ""
|
||||
response = Response(info)
|
||||
if cached_value:
|
||||
response.fromcache = True
|
||||
return (response, content)
|
||||
|
||||
if entry_disposition == "STALE":
|
||||
if info.has_key('etag') and not self.ignore_etag:
|
||||
headers['if-none-match'] = info['etag']
|
||||
if info.has_key('last-modified'):
|
||||
headers['if-modified-since'] = info['last-modified']
|
||||
elif entry_disposition == "TRANSPARENT":
|
||||
pass
|
||||
|
||||
(response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
|
||||
|
||||
if response.status == 304 and method == "GET":
|
||||
# Rewrite the cache entry with the new end-to-end headers
|
||||
# Take all headers that are in response
|
||||
# and overwrite their values in info.
|
||||
# unless they are hop-by-hop, or are listed in the connection header.
|
||||
|
||||
for key in _get_end2end_headers(response):
|
||||
info[key] = response[key]
|
||||
merged_response = Response(info)
|
||||
if hasattr(response, "_stale_digest"):
|
||||
merged_response._stale_digest = response._stale_digest
|
||||
try:
|
||||
_updateCache(headers, merged_response, content, self.cache, cachekey)
|
||||
except:
|
||||
print locals()
|
||||
raise
|
||||
response = merged_response
|
||||
response.status = 200
|
||||
response.fromcache = True
|
||||
|
||||
elif response.status == 200:
|
||||
content = new_content
|
||||
else:
|
||||
self.cache.delete(cachekey)
|
||||
content = new_content
|
||||
else:
|
||||
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
|
||||
return (response, content)
|
||||
|
||||
|
||||
|
||||
class Response(dict):
|
||||
"""An object more like rfc822.Message than httplib.HTTPResponse."""
|
||||
|
||||
"""Is this response from our local cache"""
|
||||
fromcache = False
|
||||
|
||||
"""HTTP protocol version used by server. 10 for HTTP/1.0, 11 for HTTP/1.1. """
|
||||
version = 11
|
||||
|
||||
"Status code returned by server. "
|
||||
status = 200
|
||||
|
||||
"""Reason phrase returned by server."""
|
||||
reason = "Ok"
|
||||
|
||||
previous = None
|
||||
|
||||
def __init__(self, info):
|
||||
# info is either an rfc822.Message or
|
||||
# an httplib.HTTPResponse object.
|
||||
if isinstance(info, httplib.HTTPResponse):
|
||||
for key, value in info.getheaders():
|
||||
self[key] = value
|
||||
self.status = info.status
|
||||
self['status'] = str(self.status)
|
||||
self.reason = info.reason
|
||||
self.version = info.version
|
||||
elif isinstance(info, rfc822.Message):
|
||||
for key, value in info.items():
|
||||
self[key] = value
|
||||
self.status = int(self['status'])
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name == 'dict':
|
||||
return self
|
||||
else:
|
||||
raise AttributeError, name
|
||||
|
||||
|
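A minimal usage sketch of the bundled httplib2 client as the spider code below employs it (the cache path and feed URL are placeholders, not from the codebase):

    import httplib2

    h = httplib2.Http('cache-dir')                        # responses are cached on disk
    (resp, content) = h.request('http://example.org/feed.atom')
    if not resp.fromcache and resp.status < 300:          # the fields spiderPlanet checks
        print 'fetched %d bytes' % len(content)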
planet/spider.py  (116 changed lines)
@@ -4,10 +4,11 @@ and write each as a set of entries in a cache directory.
"""

# Standard library modules
import time, calendar, re, os
import time, calendar, re, os, urlparse
from xml.dom import minidom
# Planet modules
import planet, config, feedparser, reconstitute, shell
import planet, config, feedparser, reconstitute, shell, socket
from StringIO import StringIO

# Regular expressions to sanitise cache filenames
re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?')
@@ -116,8 +117,11 @@ def scrub(feed, data):
           source.author_detail.has_key('name'):
            source.author_detail['name'] = \
                str(stripHtml(source.author_detail.name))
def _is_http_uri(uri):
    parsed = urlparse.urlparse(uri)
    return parsed[0] in ['http', 'https']

def spiderFeed(feed, only_if_new=0):
def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
    """ Spider (fetch) a single feed """
    log = planet.logger

@@ -125,6 +129,7 @@ def spiderFeed(feed, only_if_new=0):
    sources = config.cache_sources_directory()
    if not os.path.exists(sources):
        os.makedirs(sources, 0700)

    feed_source = filename(sources, feed)
    feed_info = feedparser.parse(feed_source)
    if feed_info.feed and only_if_new:
@@ -135,6 +140,17 @@
        return

    # read feed itself
    if content:
        # httplib2 was used to get the content, so prepare a
        # proper object to pass to feedparser.
        f = StringIO(content)
        setattr(f, 'url', resp_headers.get('-location', feed))
        if resp_headers:
            if resp_headers.has_key('content-encoding'):
                del resp_headers['content-encoding']
            setattr(f, 'headers', resp_headers)
        data = feedparser.parse(f)
    else:
        modified = None
        try:
            modified=time.strptime(
@@ -326,12 +342,99 @@ def spiderFeed(feed, only_if_new=0):
def spiderPlanet(only_if_new = False):
    """ Spider (fetch) an entire planet """
    log = planet.getLogger(config.log_level(),config.log_format())
    planet.setTimeout(config.feed_timeout())

    global index
    index = True

    for feed in config.subscriptions():
    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
    except:
        try:
            from planet import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    if int(config.spider_threads()):
        from Queue import Queue, Empty
        from threading import Thread
        import httplib2
        from socket import gaierror, error

        work_queue = Queue()
        awaiting_parsing = Queue()

        http_cache = config.http_cache_directory()
        if not os.path.exists(http_cache):
            os.makedirs(http_cache, 0700)

        def _spider_proc(thread_index):
            h = httplib2.Http(http_cache)
            try:
                while True:
                    # The non-blocking get will throw an exception when the queue
                    # is empty which will terminate the thread.
                    uri = work_queue.get(block=False)
                    log.info("Fetching %s via %d", uri, thread_index)
                    try:
                        (resp, content) = h.request(uri)
                        awaiting_parsing.put(block=True, item=(resp, content, uri))
                    except gaierror:
                        log.error("Fail to resolve server name %s via %d", uri, thread_index)
                    except error, e:
                        log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
                    except Exception, e:
                        import sys, traceback
                        type, value, tb = sys.exc_info()
                        log.error('Error processing %s', uri)
                        for line in (traceback.format_exception_only(type, value) +
                            traceback.format_tb(tb)):
                            log.error(line.rstrip())

            except Empty, e:
                log.info("Thread %d finished", thread_index)
                pass

        # Load the work_queue with all the HTTP(S) uris.
        map(work_queue.put, [uri for uri in config.subscriptions() if _is_http_uri(uri)])

        # Start all the worker threads
        threads = dict([(i, Thread(target=_spider_proc, args=(i,))) for i in range(int(config.spider_threads()))])
        for t in threads.itervalues():
            t.start()

        # Process the results as they arrive
        while work_queue.qsize() or awaiting_parsing.qsize() or threads:
            if awaiting_parsing.qsize() == 0 and threads:
                time.sleep(1)
            while awaiting_parsing.qsize():
                item = awaiting_parsing.get(False)
                try:
                    (resp_headers, content, uri) = item
                    if not resp_headers.fromcache:
                        if resp_headers.status < 300:
                            log.info("Parsing pre-fetched %s", uri)
                            spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers)
                        else:
                            log.error("Status code %d from %s", resp_headers.status, uri)
                except Exception, e:
                    import sys, traceback
                    type, value, tb = sys.exc_info()
                    log.error('Error processing %s', uri)
                    for line in (traceback.format_exception_only(type, value) +
                        traceback.format_tb(tb)):
                        log.error(line.rstrip())
            for index in threads.keys():
                if not threads[index].isAlive():
                    del threads[index]
        log.info("Finished threaded part of processing.")

    # Process non-HTTP uris if we are threading, otherwise process *all* uris here.
    unthreaded_work_queue = [uri for uri in config.subscriptions() if not int(config.spider_threads()) or not _is_http_uri(uri)]
    for feed in unthreaded_work_queue:
        try:
            spiderFeed(feed, only_if_new=only_if_new)
        except Exception,e:
@@ -341,3 +444,6 @@ def spiderPlanet(only_if_new = False):
            for line in (traceback.format_exception_only(type, value) +
                traceback.format_tb(tb)):
                log.error(line.rstrip())
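The threaded fetch in spiderPlanet above boils down to a worker-pool pattern: fill a Queue with URIs, let N threads drain it into a results queue, and handle the results on the main thread. A stripped-down, runnable illustration of the same pattern (Python 2, like the codebase; the string work items and names such as _worker stand in for real feeds and are not from the codebase):

    from Queue import Queue, Empty
    from threading import Thread

    work_queue = Queue()
    results = Queue()

    def _worker(index):
        try:
            while True:
                item = work_queue.get(block=False)   # raises Empty once the queue is drained
                results.put((index, item.upper()))   # stand-in for fetching and parsing a feed
        except Empty:
            pass                                     # queue exhausted: the thread exits

    for item in ['a', 'b', 'c']:
        work_queue.put(item)

    threads = [Thread(target=_worker, args=(i,)) for i in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    while results.qsize():
        print results.get(block=False)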
tests/data/spider/threaded.ini  (new file, 19 lines)
@@ -0,0 +1,19 @@
[Planet]
name = test planet
cache_directory = tests/work/spider/cache
spider_threads = 2

# for testing purposes, must equal port number below
test_port = 8098

[http://127.0.0.1:8098/tests/data/spider/testfeed0.atom]
name = not found

[http://127.0.0.1:8098/tests/data/spider/testfeed1b.atom]
name = one

[http://127.0.0.1:8098/tests/data/spider/testfeed2.atom]
name = two

[http://127.0.0.1:8098/tests/data/spider/testfeed3.rss]
name = three
@@ -1,6 +1,6 @@
#!/usr/bin/env python

import unittest, os, glob, calendar, shutil
import unittest, os, glob, calendar, shutil, time
from planet.spider import filename, spiderFeed, spiderPlanet
from planet import feedparser, config
import planet
@@ -43,9 +43,7 @@ class SpiderTest(unittest.TestCase):
        self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
            filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))

    def test_spiderFeed(self):
        config.load(configfile)
        spiderFeed(testfeed % '1b')
    def verify_spiderFeed(self):
        files = glob.glob(workdir+"/*")
        files.sort()

@@ -64,13 +62,18 @@
        self.assertEqual(os.stat(files[2]).st_mtime,
            calendar.timegm(data.entries[0].updated_parsed))

    def test_spiderUpdate(self):
        spiderFeed(testfeed % '1a')
        self.test_spiderFeed()

    def test_spiderPlanet(self):
    def test_spiderFeed(self):
        config.load(configfile)
        spiderPlanet()
        spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def test_spiderUpdate(self):
        config.load(configfile)
        spiderFeed(testfeed % '1a')
        spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def verify_spiderPlanet(self):
        files = glob.glob(workdir+"/*")

        # verify that exactly eight files + 1 source dir were produced
@@ -88,3 +91,48 @@
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('three', data.entries[0].source.author_detail.name)

    def test_spiderPlanet(self):
        config.load(configfile)
        spiderPlanet()
        self.verify_spiderPlanet()

    def test_spiderThreads(self):
        config.load(configfile.replace('config','threaded'))
        _PORT = config.parser.getint('Planet','test_port')

        log = []
        from SimpleHTTPServer import SimpleHTTPRequestHandler
        class TestRequestHandler(SimpleHTTPRequestHandler):
            def log_message(self, format, *args):
                log.append(args)

        from threading import Thread
        class TestServerThread(Thread):
            def __init__(self):
                self.ready = 0
                self.done = 0
                Thread.__init__(self)
            def run(self):
                from BaseHTTPServer import HTTPServer
                httpd = HTTPServer(('',_PORT), TestRequestHandler)
                self.ready = 1
                while not self.done:
                    httpd.handle_request()

        httpd = TestServerThread()
        httpd.start()
        while not httpd.ready:
            time.sleep(0.1)

        try:
            spiderPlanet()
        finally:
            httpd.done = 1
            import urllib
            urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()

        status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
        status.sort()
        self.assertEqual([200,200,200,200,404], status)

        self.verify_spiderPlanet()