Resync with httplib2

This commit is contained in:
Sam Ruby 2006-11-19 11:56:36 -05:00
parent c337597302
commit 20cb60df7c
2 changed files with 134 additions and 65 deletions

View File

@@ -1,3 +1,4 @@
from __future__ import generators
"""
httplib2
@@ -8,21 +9,22 @@ Requires Python 2.3 or later
"""
from __future__ import generators
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
"James Antill",
"Xavier Verges Farrero",
"Jonathan Feinberg",
"Blair Zajac"]
"Blair Zajac",
"Sam Ruby"]
__license__ = "MIT"
__version__ = "$Rev: 209 $"
__version__ = "$Rev: 217 $"
import re
import md5
import rfc822
import email
import email.Utils
import email.Message
import StringIO
import gzip
import zlib
@@ -114,6 +116,49 @@ def parse_uri(uri):
groups = URI.match(uri).groups()
return (groups[1], groups[3], groups[4], groups[6], groups[8])
def urlnorm(uri):
(scheme, authority, path, query, fragment) = parse_uri(uri)
authority = authority.lower()
scheme = scheme.lower()
if not path:
path = "/"
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
defrag_uri = scheme + "://" + authority + request_uri
return scheme, authority, request_uri, defrag_uri
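A quick usage sketch of the new urlnorm helper (the URI below is illustrative, not from the commit): the scheme and authority are lowercased, an empty path becomes "/", and the fragment is dropped, yielding a stable cache key.

# Hypothetical example; relies only on urlnorm/parse_uri as defined above.
scheme, authority, request_uri, defrag_uri = urlnorm(
    "HTTP://Example.COM/feed?page=2#latest")
assert scheme == "http"
assert authority == "example.com"
assert request_uri == "/feed?page=2"
assert defrag_uri == "http://example.com/feed?page=2"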
# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
re_url_scheme = re.compile(r'^\w+://')
re_slash = re.compile(r'[?/:|]+')
def safename(filename):
"""Return a filename suitable for the cache.
Strips dangerous and common characters to create a filename we
can use to store the cache in.
"""
try:
if re_url_scheme.match(filename):
if isinstance(filename,str):
filename=filename.decode('utf-8').encode('idna')
else:
filename=filename.encode('idna')
except:
pass
if isinstance(filename,unicode):
filename=filename.encode('utf-8')
filemd5 = md5.new(filename).hexdigest()
filename = re_url_scheme.sub("", filename)
filename = re_slash.sub(",", filename)
# limit length of filename
if len(filename)>200:
filename=filename[:200]
return ",".join((filename, filemd5))
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
def _normalize_headers(headers):
return dict([ (key.lower(), NORMALIZE_SPACE.sub(' ', value).strip()) for (key, value) in headers.iteritems()])
@@ -211,13 +256,13 @@ def _entry_disposition(response_headers, request_headers):
elif cc.has_key('only-if-cached'):
retval = "FRESH"
elif response_headers.has_key('date'):
date = calendar.timegm(rfc822.parsedate_tz(response_headers['date']))
date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
now = time.time()
current_age = max(0, now - date)
if cc_response.has_key('max-age'):
freshness_lifetime = int(cc_response['max-age'])
elif response_headers.has_key('expires'):
expires = rfc822.parsedate_tz(response_headers['expires'])
expires = email.Utils.parsedate_tz(response_headers['expires'])
freshness_lifetime = max(0, calendar.timegm(expires) - date)
else:
freshness_lifetime = 0
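A sketch of the date arithmetic above, using only stdlib calls this module already imports (the date string is illustrative): email.Utils.parsedate_tz returns a 10-tuple whose first nine fields calendar.timegm consumes.

# Hypothetical example of the freshness math in _entry_disposition.
import calendar, time
import email.Utils
date = calendar.timegm(email.Utils.parsedate_tz(
    'Sun, 19 Nov 2006 16:56:36 GMT'))
current_age = max(0, time.time() - date)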
@@ -232,12 +277,14 @@ def _entry_disposition(response_headers, request_headers):
def _decompressContent(response, new_content):
content = new_content
try:
if response.get('content-encoding', None) == 'gzip':
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
response['content-length'] = str(len(content))
if response.get('content-encoding', None) == 'deflate':
content = zlib.decompress(content)
encoding = response.get('content-encoding', None)
if encoding in ['gzip', 'deflate']:
if encoding == 'gzip':
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
if encoding == 'deflate':
content = zlib.decompress(content)
response['content-length'] = str(len(content))
del response['content-encoding']
except:
content = ""
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'))
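A round-trip sketch of the reworked _decompressContent (the plain dict here stands in for an httplib2 Response object, and the function is assumed to return the decompressed body):

# Hypothetical example: gzip a body, then decompress it the way the
# new code path does.
import gzip, StringIO
buf = StringIO.StringIO()
gz = gzip.GzipFile(fileobj=buf, mode='wb')
gz.write('hello world')
gz.close()
response = {'content-encoding': 'gzip'}
body = _decompressContent(response, buf.getvalue())
assert body == 'hello world'
assert response['content-length'] == '11'
assert 'content-encoding' not in response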
@@ -250,14 +297,23 @@ def _updateCache(request_headers, response_headers, content, cache, cachekey):
if cc.has_key('no-store') or cc_response.has_key('no-store'):
cache.delete(cachekey)
else:
f = StringIO.StringIO("")
info = rfc822.Message(StringIO.StringIO(""))
info = email.Message.Message()
for key, value in response_headers.iteritems():
info[key] = value
f.write(str(info))
f.write("\r\n\r\n")
f.write(content)
cache.set(cachekey, f.getvalue())
if key not in ['status','content-encoding','transfer-encoding']:
info[key] = value
status = response_headers.status
if status == 304:
status = 200
status_header = 'status: %d\r\n' % status
header_str = info.as_string()
header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
text = "".join([status_header, header_str, content])
cache.set(cachekey, text)
def _cnonce():
dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 10)] for i in range(20)])).hexdigest()
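The rewritten _updateCache above stores a synthesized status line, the CRLF-normalized header block, and the body in one string; email.message_from_string can read it straight back (values below are illustrative):

# Hand-built cache entry matching the layout _updateCache writes.
import email
cached = 'status: 200\r\ncontent-type: text/html\r\n\r\n<html/>'
info = email.message_from_string(cached)
assert info['status'] == '200'
assert cached.split('\r\n\r\n', 1)[1] == '<html/>'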
@@ -498,20 +554,23 @@ AUTH_SCHEME_CLASSES = {
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
def _md5(s):
    return md5.new(s).hexdigest()
class FileCache:
"""Uses a local directory as a store for cached files.
Not really safe to use if multiple threads or processes are going to
be running on the same cache.
"""
def __init__(self, cache):
def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
self.cache = cache
self.safe = safe
if not os.path.exists(cache):
os.makedirs(self.cache)
def get(self, key):
retval = None
cacheFullPath = os.path.join(self.cache, key)
cacheFullPath = os.path.join(self.cache, self.safe(key))
try:
f = file(cacheFullPath, "r")
retval = f.read()
@@ -521,13 +580,13 @@ class FileCache:
return retval
def set(self, key, value):
cacheFullPath = os.path.join(self.cache, key)
cacheFullPath = os.path.join(self.cache, self.safe(key))
f = file(cacheFullPath, "w")
f.write(value)
f.close()
def delete(self, key):
cacheFullPath = os.path.join(self.cache, key)
cacheFullPath = os.path.join(self.cache, self.safe(key))
if os.path.exists(cacheFullPath):
os.remove(cacheFullPath)
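Usage sketch for the updated FileCache: keys are now full URIs and safename() maps them to filesystem-safe names (the directory and values are illustrative):

cache = FileCache('.cache')
cache.set('http://example.com/feed', 'status: 200\r\n\r\nhello')
assert cache.get('http://example.com/feed').endswith('hello')
cache.delete('http://example.com/feed')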
@@ -639,7 +698,8 @@ class Http:
response['location'] = urlparse.urljoin(absolute_uri, location)
if response.status == 301 and method in ["GET", "HEAD"]:
response['-x-permanent-redirect-url'] = response['location']
response['-location'] = absolute_uri
if not response.has_key('content-location'):
response['content-location'] = absolute_uri
_updateCache(headers, response, content, self.cache, cachekey)
if headers.has_key('if-none-match'):
del headers['if-none-match']
@@ -648,7 +708,8 @@ class Http:
if response.has_key('location'):
location = response['location']
old_response = copy.deepcopy(response)
old_response['-location'] = absolute_uri
if not old_response.has_key('content-location'):
old_response['content-location'] = absolute_uri
redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
response.previous = old_response
@@ -656,7 +717,8 @@ class Http:
raise RedirectLimit( _("Redirected more times than redirection_limit allows."))
elif response.status in [200, 203] and method == "GET":
# Don't cache 206's since we aren't going to handle byte range requests
response['-location'] = absolute_uri
if not response.has_key('content-location'):
response['content-location'] = absolute_uri
_updateCache(headers, response, content, self.cache, cachekey)
return (response, content)
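A hedged usage sketch of the redirect behavior above, assuming Http accepts a cache object as its constructor argument (which the use of self.cache implies): with caching enabled, the final response records the URI actually fetched in content-location, unless the server already set one, and keeps the pre-redirect response in .previous. The URI and cache directory are illustrative.

h = Http(FileCache('.cache'))
(response, content) = h.request('http://example.com/old-feed')
if response.previous:  # a redirect was followed
    final_uri = response['content-location']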
@@ -690,14 +752,7 @@ a string that contains the response entity body.
if not headers.has_key('user-agent'):
headers['user-agent'] = "Python-httplib2/%s" % __version__
(scheme, authority, path, query, fragment) = parse_uri(uri)
authority = authority.lower()
if not path:
path = "/"
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
defrag_uri = scheme + "://" + authority + request_uri
(scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
if not self.connections.has_key(scheme+":"+authority):
connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
@@ -709,17 +764,16 @@ a string that contains the response entity body.
if method in ["GET", "HEAD"] and 'range' not in headers:
headers['accept-encoding'] = 'compress, gzip'
info = rfc822.Message(StringIO.StringIO(""))
info = email.Message.Message()
cached_value = None
if self.cache:
cachekey = md5.new(defrag_uri).hexdigest()
cachekey = defrag_uri
cached_value = self.cache.get(cachekey)
if cached_value:
try:
f = StringIO.StringIO(cached_value)
info = rfc822.Message(f)
info = email.message_from_string(cached_value)
content = cached_value.split('\r\n\r\n', 1)[1]
except:
except Exception, e:
self.cache.delete(cachekey)
cachekey = None
cached_value = None
@@ -802,7 +856,7 @@ a string that contains the response entity body.
class Response(dict):
"""An object more like rfc822.Message than httplib.HTTPResponse."""
"""An object more like email.Message than httplib.HTTPResponse."""
"""Is this response from our local cache"""
fromcache = False
@@ -819,7 +873,7 @@ class Response(dict):
previous = None
def __init__(self, info):
# info is either an rfc822.Message or
# info is either an email.Message or
# an httplib.HTTPResponse object.
if isinstance(info, httplib.HTTPResponse):
for key, value in info.getheaders():
@@ -828,7 +882,7 @@ class Response(dict):
self['status'] = str(self.status)
self.reason = info.reason
self.version = info.version
elif isinstance(info, rfc822.Message):
elif isinstance(info, email.Message.Message):
for key, value in info.items():
self[key] = value
self.status = int(self['status'])
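A sketch of the email.Message branch just above: a Response built from a parsed cache entry picks up its headers and an integer status (values illustrative).

import email.Message
info = email.Message.Message()
info['status'] = '304'
info['content-type'] = 'application/atom+xml'
r = Response(info)
assert r.status == 304
assert r['content-type'] == 'application/atom+xml'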

View File

@@ -140,17 +140,7 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
return
# read feed itself
if content:
# httplib2 was used to get the content, so prepare a
# proper object to pass to feedparser.
f = StringIO(content)
setattr(f, 'url', resp_headers.get('-location', feed))
if resp_headers:
if resp_headers.has_key('content-encoding'):
del resp_headers['content-encoding']
setattr(f, 'headers', resp_headers)
data = feedparser.parse(f)
else:
if not resp_headers:
modified = None
try:
modified=time.strptime(
@@ -159,12 +149,25 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
pass
data = feedparser.parse(feed_info.feed.get('planet_http_location',feed),
etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
elif int(resp_headers.status) < 300:
# httplib2 was used to get the content, so prepare a
# proper object to pass to feedparser.
f = StringIO(content)
setattr(f, 'url', resp_headers.get('content-location', feed))
if resp_headers:
if resp_headers.has_key('content-encoding'):
del resp_headers['content-encoding']
setattr(f, 'headers', resp_headers)
data = feedparser.parse(f)
else:
data = feedparser.FeedParserDict({'status': int(resp_headers.status),
'headers':resp_headers, 'version':None, 'entries': []})
# capture http status
if not data.has_key("status"):
if data.has_key("entries") and len(data.entries)>0:
data.status = 200
elif data.bozo and data.bozo_exception.__class__.__name__=='Timeout':
elif data.bozo and data.bozo_exception.__class__.__name__.lower()=='timeout':
data.status = 408
else:
data.status = 500
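The lowercased class-name comparison matters because Python's socket timeout exception is named 'timeout' (all lowercase), so the old exact match on 'Timeout' never caught it:

import socket
assert socket.timeout.__name__ == 'timeout'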
@@ -380,13 +383,27 @@ def spiderPlanet(only_if_new = False):
# is empty which will terminate the thread.
uri = work_queue.get(block=False)
log.info("Fetching %s via %d", uri, thread_index)
resp = feedparser.FeedParserDict({'status':'500'})
content = None
try:
(resp, content) = h.request(uri)
awaiting_parsing.put(block=True, item=(resp, content, uri))
try:
if isinstance(uri,unicode):
idna = uri.encode('idna')
else:
idna = uri.decode('utf-8').encode('idna')
if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
except:
log.info("unable to map %s to a URI", uri)
idna = uri
(resp, content) = h.request(idna)
except gaierror:
log.error("Fail to resolve server name %s via %d", uri, thread_index)
except error, e:
log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
if e.__class__.__name__.lower()=='timeout':
resp['status'] = '408'
log.warn("Timeout in thread-%d", thread_index)
else:
log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
except Exception, e:
import sys, traceback
type, value, tb = sys.exc_info()
@@ -394,6 +411,7 @@ def spiderPlanet(only_if_new = False):
for line in (traceback.format_exception_only(type, value) +
traceback.format_tb(tb)):
log.error(line.rstrip())
awaiting_parsing.put(block=True, item=(resp, content, uri))
except Empty, e:
log.info("Thread %d finished", thread_index)
@@ -409,18 +427,15 @@ def spiderPlanet(only_if_new = False):
# Process the results as they arrive
while work_queue.qsize() or awaiting_parsing.qsize() or threads:
if awaiting_parsing.qsize() == 0 and threads:
time.sleep(1)
while awaiting_parsing.qsize() == 0 and threads:
time.sleep(0.1)
while awaiting_parsing.qsize():
item = awaiting_parsing.get(False)
try:
(resp_headers, content, uri) = item
if not resp_headers.fromcache:
if resp_headers.status < 300:
log.info("Parsing pre-fetched %s", uri)
spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers)
else:
log.error("Status code %d from %s", resp_headers.status, uri)
if resp_headers.status == 200 and resp_headers.fromcache:
resp_headers.status = 304
spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers)
except Exception, e:
import sys, traceback
type, value, tb = sys.exc_info()