diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 760af50..8607f95 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -13,13 +13,18 @@ well formed XHTML.
 Todo:
 * extension elements
 """
-import re, time, md5, sgmllib
+import re, time, sgmllib
 from xml.sax.saxutils import escape
 from xml.dom import minidom, Node
 from html5lib import liberalxmlparser
 from html5lib.treebuilders import dom
 import planet, config

+try:
+    from hashlib import md5
+except:
+    from md5 import new as md5
+
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")

 def createTextElement(parent, name, value):
@@ -68,14 +73,14 @@ def id(xentry, entry):
         entry_id = entry.link
     elif entry.has_key("title") and entry.title:
         entry_id = (entry.title_detail.base + "/" +
-                    md5.new(entry.title).hexdigest())
+                    md5(entry.title).hexdigest())
     elif entry.has_key("summary") and entry.summary:
         entry_id = (entry.summary_detail.base + "/" +
-                    md5.new(entry.summary).hexdigest())
+                    md5(entry.summary).hexdigest())
     elif entry.has_key("content") and entry.content:
         entry_id = (entry.content[0].base + "/" +
-                    md5.new(entry.content[0].value).hexdigest())
+                    md5(entry.content[0].value).hexdigest())
     else:
         return
diff --git a/planet/spider.py b/planet/spider.py
index 4aacc0b..59afcb6 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -10,6 +10,11 @@ from xml.dom import minidom
 import planet, config, feedparser, reconstitute, shell, socket, scrub
 from StringIO import StringIO

+try:
+    from hashlib import md5
+except:
+    from md5 import new as md5
+
 # Regular expressions to sanitise cache filenames
 re_url_scheme    = re.compile(r'^\w+:/*(\w+:|www\.)?')
 re_slash         = re.compile(r'[?/:|]+')
@@ -44,9 +49,8 @@ def filename(directory, filename):
         parts=filename.split(',')
         for i in range(len(parts),0,-1):
             if len(','.join(parts[:i])) < 220:
-                import md5
                 filename = ','.join(parts[:i]) + ',' + \
-                    md5.new(','.join(parts[i:])).hexdigest()
+                    md5(','.join(parts[i:])).hexdigest()
                 break
     return os.path.join(directory, filename)
@@ -277,7 +281,7 @@ def writeCache(feed_uri, feed_info, data):
     xdoc.unlink()

 def httpThread(thread_index, input_queue, output_queue, log):
-    import httplib2, md5
+    import httplib2
     from httplib import BadStatusLine

     h = httplib2.Http(config.http_cache_directory())
@@ -312,7 +316,7 @@ def httpThread(thread_index, input_queue, output_queue, log):
                 (resp, content) = h.request(idna, 'GET', headers=headers)

                 # unchanged detection
-                resp['-content-hash'] = md5.new(content or '').hexdigest()
+                resp['-content-hash'] = md5(content or '').hexdigest()
                 if resp.status == 200:
                     if resp.fromcache:
                         resp.status = 304
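
Both files above switch to the same guarded import, so the pattern is worth reading on its own. A minimal standalone sketch (the patch itself uses a bare except:, for which ImportError is the specific case):

    # Prefer hashlib (Python 2.5+); fall back to the md5 module
    # that hashlib replaced and that Python 2.6 deprecates.
    try:
        from hashlib import md5
    except ImportError:
        from md5 import new as md5

    digest = md5('http://example.com/feed').hexdigest()

Either import path binds a callable named md5, which is why the call sites can drop the old md5.new(...) spelling.
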
+ """ __author__ = "Joe Gregorio (joe@bitworking.org)" @@ -19,14 +22,14 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", "Sam Ruby", "Louis Nyffenegger"] __license__ = "MIT" -__version__ = "$Rev: 227 $" +__version__ = "$Rev$" import re import sys -import md5 import email import email.Utils import email.Message +import email.FeedParser import StringIO import gzip import zlib @@ -38,10 +41,32 @@ import copy import calendar import time import random -import sha +# remove depracated warning in python2.6 +try: + from hashlib import sha1 as _sha, md5 as _md5 +except ImportError: + import sha + import md5 + _sha = sha.new + _md5 = md5.new import hmac from gettext import gettext as _ -from socket import gaierror +import socket + +try: + import socks +except ImportError: + socks = None + +# Build the appropriate socket wrapper for ssl +try: + import ssl # python 2.6 + _ssl_wrap_socket = ssl.wrap_socket +except ImportError: + def _ssl_wrap_socket(sock, key_file, cert_file): + ssl_sock = socket.ssl(sock, key_file, cert_file) + return httplib.FakeSocket(sock, ssl_sock) + if sys.version_info >= (2,3): from iri2uri import iri2uri @@ -49,7 +74,12 @@ else: def iri2uri(uri): return uri -__all__ = ['Http', 'Response', 'HttpLib2Error', +def has_timeout(timeout): # python 2.6 + if hasattr(socket, '_GLOBAL_DEFAULT_TIMEOUT'): + return (timeout is not None and timeout is not socket._GLOBAL_DEFAULT_TIMEOUT) + return (timeout is not None) + +__all__ = ['Http', 'Response', 'ProxyInfo', 'HttpLib2Error', 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent', 'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError', 'debuglevel'] @@ -58,6 +88,7 @@ __all__ = ['Http', 'Response', 'HttpLib2Error', # The httplib debug level, set to a non-zero value to get debug output debuglevel = 0 + # Python 2.3 support if sys.version_info < (2,4): def sorted(seq): @@ -77,11 +108,20 @@ if not hasattr(httplib.HTTPResponse, 'getheaders'): # All exceptions raised here derive from HttpLib2Error class HttpLib2Error(Exception): pass -class RedirectMissingLocation(HttpLib2Error): pass -class RedirectLimit(HttpLib2Error): pass -class FailedToDecompressContent(HttpLib2Error): pass -class UnimplementedDigestAuthOptionError(HttpLib2Error): pass -class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass +# Some exceptions can be caught and optionally +# be turned back into responses. 
+class HttpLib2ErrorWithResponse(HttpLib2Error): + def __init__(self, desc, response, content): + self.response = response + self.content = content + HttpLib2Error.__init__(self, desc) + +class RedirectMissingLocation(HttpLib2ErrorWithResponse): pass +class RedirectLimit(HttpLib2ErrorWithResponse): pass +class FailedToDecompressContent(HttpLib2ErrorWithResponse): pass +class UnimplementedDigestAuthOptionError(HttpLib2ErrorWithResponse): pass +class UnimplementedHmacDigestAuthOptionError(HttpLib2ErrorWithResponse): pass + class RelativeURIError(HttpLib2Error): pass class ServerNotFoundError(HttpLib2Error): pass @@ -159,11 +199,11 @@ def safename(filename): filename = filename.encode('idna') else: filename = filename.encode('idna') - except: + except UnicodeError: pass if isinstance(filename,unicode): filename=filename.encode('utf-8') - filemd5 = md5.new(filename).hexdigest() + filemd5 = _md5(filename).hexdigest() filename = re_url_scheme.sub("", filename) filename = re_slash.sub(",", filename) @@ -180,8 +220,8 @@ def _parse_cache_control(headers): retval = {} if headers.has_key('cache-control'): parts = headers['cache-control'].split(',') - parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")] - parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")] + parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")] + parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")] retval = dict(parts_with_args + parts_wo_args) return retval @@ -275,22 +315,25 @@ def _entry_disposition(response_headers, request_headers): if cc_response.has_key('max-age'): try: freshness_lifetime = int(cc_response['max-age']) - except: + except ValueError: freshness_lifetime = 0 elif response_headers.has_key('expires'): expires = email.Utils.parsedate_tz(response_headers['expires']) - freshness_lifetime = max(0, calendar.timegm(expires) - date) + if None == expires: + freshness_lifetime = 0 + else: + freshness_lifetime = max(0, calendar.timegm(expires) - date) else: freshness_lifetime = 0 if cc.has_key('max-age'): try: freshness_lifetime = int(cc['max-age']) - except: + except ValueError: freshness_lifetime = 0 if cc.has_key('min-fresh'): try: min_fresh = int(cc['min-fresh']) - except: + except ValueError: min_fresh = 0 current_age += min_fresh if freshness_lifetime > current_age: @@ -307,10 +350,12 @@ def _decompressContent(response, new_content): if encoding == 'deflate': content = zlib.decompress(content) response['content-length'] = str(len(content)) + # Record the historical presence of the encoding in a way the won't interfere. 
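
With the rebased hierarchy above, a caller can recover the partial response when one of these errors is raised. A hypothetical sketch against the patched module (the URL is made up):

    import httplib2

    h = httplib2.Http()
    try:
        response, content = h.request('http://example.org/redirect-loop')
    except httplib2.RedirectLimit, e:
        # The exception now carries the last response and body,
        # which the old HttpLib2Error subclasses discarded.
        print e.response.status
        print e.content
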
@@ -159,11 +199,11 @@ def safename(filename):
                 filename = filename.encode('idna')
             else:
                 filename = filename.encode('idna')
-    except:
+    except UnicodeError:
         pass
     if isinstance(filename,unicode):
         filename=filename.encode('utf-8')
-    filemd5 = md5.new(filename).hexdigest()
+    filemd5 = _md5(filename).hexdigest()
     filename = re_url_scheme.sub("", filename)
     filename = re_slash.sub(",", filename)
@@ -180,8 +220,8 @@ def _parse_cache_control(headers):
     retval = {}
     if headers.has_key('cache-control'):
         parts = headers['cache-control'].split(',')
-        parts_with_args = [tuple([x.strip() for x in part.split("=")]) for part in parts if -1 != part.find("=")]
-        parts_wo_args = [(name.strip(), 1) for name in parts if -1 == name.find("=")]
+        parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)]) for part in parts if -1 != part.find("=")]
+        parts_wo_args = [(name.strip().lower(), 1) for name in parts if -1 == name.find("=")]
         retval = dict(parts_with_args + parts_wo_args)
     return retval
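
The split("=", 1) and lower() fixes above change what _parse_cache_control returns for mixed-case directives and for values that themselves contain '='. A quick illustration, inlining the patched comprehensions rather than calling the private helper (the header value is made up):

    headers = {'cache-control': 'Max-Age=3600, no-transform, private="x=y"'}
    parts = headers['cache-control'].split(',')
    parts_with_args = [tuple([x.strip().lower() for x in part.split("=", 1)])
                       for part in parts if -1 != part.find("=")]
    parts_wo_args = [(name.strip().lower(), 1)
                     for name in parts if -1 == name.find("=")]
    print dict(parts_with_args + parts_wo_args)
    # => {'max-age': '3600', 'no-transform': 1, 'private': '"x=y"'}

With the old split("="), the private="x=y" part produced a three-element tuple, which dict() rejects.
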
@@ -275,22 +315,25 @@ def _entry_disposition(response_headers, request_headers):
         if cc_response.has_key('max-age'):
             try:
                 freshness_lifetime = int(cc_response['max-age'])
-            except:
+            except ValueError:
                 freshness_lifetime = 0
         elif response_headers.has_key('expires'):
             expires = email.Utils.parsedate_tz(response_headers['expires'])
-            freshness_lifetime = max(0, calendar.timegm(expires) - date)
+            if None == expires:
+                freshness_lifetime = 0
+            else:
+                freshness_lifetime = max(0, calendar.timegm(expires) - date)
         else:
             freshness_lifetime = 0
         if cc.has_key('max-age'):
             try:
                 freshness_lifetime = int(cc['max-age'])
-            except:
+            except ValueError:
                 freshness_lifetime = 0
         if cc.has_key('min-fresh'):
             try:
                 min_fresh = int(cc['min-fresh'])
-            except:
+            except ValueError:
                 min_fresh = 0
             current_age += min_fresh
         if freshness_lifetime > current_age:
@@ -307,10 +350,12 @@ def _decompressContent(response, new_content):
             if encoding == 'deflate':
                 content = zlib.decompress(content)
             response['content-length'] = str(len(content))
+            # Record the historical presence of the encoding in a way that won't interfere.
+            response['-content-encoding'] = response['content-encoding']
             del response['content-encoding']
-    except:
+    except IOError:
         content = ""
-        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'))
+        raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
     return content
@@ -339,11 +384,11 @@ def _updateCache(request_headers, response_headers, content, cache, cachekey):
             cache.set(cachekey, text)

 def _cnonce():
-    dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
+    dig = _md5("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 9)] for i in range(20)])).hexdigest()
     return dig[:16]

 def _wsse_username_token(cnonce, iso_now, password):
-    return base64.encodestring(sha.new("%s%s%s" % (cnonce, iso_now, password)).digest()).strip()
+    return base64.encodestring(_sha("%s%s%s" % (cnonce, iso_now, password)).digest()).strip()


 # For credentials we need two things, first
@@ -354,7 +399,7 @@ def _wsse_username_token(cnonce, iso_now, password):
 # So we also need each Auth instance to be able to tell us
 # how close to the 'top' it is.

-class Authentication:
+class Authentication(object):
     def __init__(self, credentials, host, request_uri, headers, response, content, http):
         (scheme, authority, path, query, fragment) = parse_uri(request_uri)
         self.path = path
@@ -405,11 +450,11 @@ class DigestAuthentication(Authentication):
         Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
         challenge = _parse_www_authenticate(response, 'www-authenticate')
         self.challenge = challenge['digest']
-        qop = self.challenge.get('qop')
+        qop = self.challenge.get('qop', 'auth')
         self.challenge['qop'] = ('auth' in [x.strip() for x in qop.split()]) and 'auth' or None
         if self.challenge['qop'] is None:
             raise UnimplementedDigestAuthOptionError( _("Unsupported value for qop: %s." % qop))
-        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5')
+        self.challenge['algorithm'] = self.challenge.get('algorithm', 'MD5').upper()
         if self.challenge['algorithm'] != 'MD5':
             raise UnimplementedDigestAuthOptionError( _("Unsupported value for algorithm: %s." % self.challenge['algorithm']))
         self.A1 = "".join([self.credentials[0], ":", self.challenge['realm'], ":", self.credentials[1]])
@@ -417,7 +462,7 @@ class DigestAuthentication(Authentication):

     def request(self, method, request_uri, headers, content, cnonce = None):
         """Modify the request headers"""
-        H = lambda x: md5.new(x).hexdigest()
+        H = lambda x: _md5(x).hexdigest()
         KD = lambda s, d: H("%s:%s" % (s, d))
         A2 = "".join([method, ":", request_uri])
         self.challenge['cnonce'] = cnonce or _cnonce()
@@ -477,13 +522,13 @@ class HmacDigestAuthentication(Authentication):
         if self.challenge['pw-algorithm'] not in ['SHA-1', 'MD5']:
             raise UnimplementedHmacDigestAuthOptionError( _("Unsupported value for pw-algorithm: %s."
                 % self.challenge['pw-algorithm']))
         if self.challenge['algorithm'] == 'HMAC-MD5':
-            self.hashmod = md5
+            self.hashmod = _md5
         else:
-            self.hashmod = sha
+            self.hashmod = _sha
         if self.challenge['pw-algorithm'] == 'MD5':
-            self.pwhashmod = md5
+            self.pwhashmod = _md5
         else:
-            self.pwhashmod = sha
+            self.pwhashmod = _sha
         self.key = "".join([self.credentials[0], ":",
                     self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(),
                     ":", self.challenge['realm']
@@ -545,8 +590,17 @@ class GoogleLoginAuthentication(Authentication):
     def __init__(self, credentials, host, request_uri, headers, response, content, http):
         from urllib import urlencode
         Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http)
+        challenge = _parse_www_authenticate(response, 'www-authenticate')
+        service = challenge['googlelogin'].get('service', 'xapi')
+        # Blogger actually returns the service in the challenge
+        # For the rest we guess based on the URI
+        if service == 'xapi' and request_uri.find("calendar") > 0:
+            service = "cl"
+        # No point in guessing Base or Spreadsheet
+        #elif request_uri.find("spreadsheets") > 0:
+        #    service = "wise"

-        auth = dict(Email=credentials[0], Passwd=credentials[1], service='cl', source=headers['user-agent'])
+        auth = dict(Email=credentials[0], Passwd=credentials[1], service=service, source=headers['user-agent'])
         resp, content = self.http.request("https://www.google.com/accounts/ClientLogin", method="POST", body=urlencode(auth), headers={'Content-Type': 'application/x-www-form-urlencoded'})
         lines = content.split('\n')
         d = dict([tuple(line.split("=", 1)) for line in lines if line])
@@ -571,10 +625,7 @@ AUTH_SCHEME_CLASSES = {

 AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]

-def _md5(s):
-    return
-
-class FileCache:
+class FileCache(object):
     """Uses a local directory as a store for cached files.
     Not really safe to use if multiple threads or processes are going to
     be running on the same cache.
@@ -589,16 +640,16 @@ class FileCache:
     def get(self, key):
         retval = None
         cacheFullPath = os.path.join(self.cache, self.safe(key))
         try:
-            f = file(cacheFullPath, "r")
+            f = file(cacheFullPath, "rb")
             retval = f.read()
             f.close()
-        except:
+        except IOError:
             pass
         return retval

     def set(self, key, value):
         cacheFullPath = os.path.join(self.cache, self.safe(key))
-        f = file(cacheFullPath, "w")
+        f = file(cacheFullPath, "wb")
         f.write(value)
         f.close()
@@ -607,12 +658,131 @@ class FileCache:
         if os.path.exists(cacheFullPath):
             os.remove(cacheFullPath)

-class Http:
-    """An HTTP client that handles all
-    methods, caching, ETags, compression,
-    HTTPS, Basic, Digest, WSSE, etc.
+class Credentials(object):
+    def __init__(self):
+        self.credentials = []
+
+    def add(self, name, password, domain=""):
+        self.credentials.append((domain.lower(), name, password))
+
+    def clear(self):
+        self.credentials = []
+
+    def iter(self, domain):
+        for (cdomain, name, password) in self.credentials:
+            if cdomain == "" or domain == cdomain:
+                yield (name, password)
+
+class KeyCerts(Credentials):
+    """Identical to Credentials except that
+    name/password are mapped to key/cert."""
+    pass
+
+
+class ProxyInfo(object):
+    """Collect information required to use a proxy."""
+    def __init__(self, proxy_type, proxy_host, proxy_port, proxy_rdns=None, proxy_user=None, proxy_pass=None):
+        """The parameter proxy_type must be set to one of socks.PROXY_TYPE_XXX
+        constants. For example:
+
+p = ProxyInfo(proxy_type=socks.PROXY_TYPE_HTTP, proxy_host='localhost', proxy_port=8000)
+        """
+        self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns, self.proxy_user, self.proxy_pass = proxy_type, proxy_host, proxy_port, proxy_rdns, proxy_user, proxy_pass
+
+    def astuple(self):
+        return (self.proxy_type, self.proxy_host, self.proxy_port, self.proxy_rdns,
+                self.proxy_user, self.proxy_pass)
+
+    def isgood(self):
+        return socks and (self.proxy_host != None) and (self.proxy_port != None)
+
+
+class HTTPConnectionWithTimeout(httplib.HTTPConnection):
+    """HTTPConnection subclass that supports timeouts"""
+
+    def __init__(self, host, port=None, strict=None, timeout=None, proxy_info=None):
+        httplib.HTTPConnection.__init__(self, host, port, strict)
+        self.timeout = timeout
+        self.proxy_info = proxy_info
+
+    def connect(self):
+        """Connect to the host and port specified in __init__."""
+        # Mostly verbatim from httplib.py.
+        msg = "getaddrinfo returns an empty list"
+        for res in socket.getaddrinfo(self.host, self.port, 0,
+                socket.SOCK_STREAM):
+            af, socktype, proto, canonname, sa = res
+            try:
+                if self.proxy_info and self.proxy_info.isgood():
+                    self.sock = socks.socksocket(af, socktype, proto)
+                    self.sock.setproxy(*self.proxy_info.astuple())
+                else:
+                    self.sock = socket.socket(af, socktype, proto)
+                # Different from httplib: support timeouts.
+                if has_timeout(self.timeout):
+                    self.sock.settimeout(self.timeout)
+                # End of difference from httplib.
+                if self.debuglevel > 0:
+                    print "connect: (%s, %s)" % (self.host, self.port)
+                self.sock.connect(sa)
+            except socket.error, msg:
+                if self.debuglevel > 0:
+                    print 'connect fail:', (self.host, self.port)
+                if self.sock:
+                    self.sock.close()
+                self.sock = None
+                continue
+            break
+        if not self.sock:
+            raise socket.error, msg
+
+class HTTPSConnectionWithTimeout(httplib.HTTPSConnection):
+    "This class allows communication via SSL."
+
+    def __init__(self, host, port=None, key_file=None, cert_file=None,
+                 strict=None, timeout=None, proxy_info=None):
+        httplib.HTTPSConnection.__init__(self, host, port=port, key_file=key_file,
+                cert_file=cert_file, strict=strict)
+        self.timeout = timeout
+        self.proxy_info = proxy_info
+
+    def connect(self):
+        "Connect to a host on a given (SSL) port."
+
+        if self.proxy_info and self.proxy_info.isgood():
+            sock = socks.socksocket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.setproxy(*self.proxy_info.astuple())
+        else:
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+
+        if has_timeout(self.timeout):
+            sock.settimeout(self.timeout)
+        sock.connect((self.host, self.port))
+        self.sock = _ssl_wrap_socket(sock, self.key_file, self.cert_file)
+
+
+
+class Http(object):
+    """An HTTP client that handles:
+- all methods
+- caching
+- ETags
+- compression,
+- HTTPS
+- Basic
+- Digest
+- WSSE
+
+and more.
     """
-    def __init__(self, cache=None):
+    def __init__(self, cache=None, timeout=None, proxy_info=None):
+        """The value of proxy_info is a ProxyInfo instance.
+
+If 'cache' is a string then it is used as a directory name
+for a disk cache. Otherwise it must be an object that supports
+the same interface as FileCache."""
+        self.proxy_info = proxy_info
         # Map domain name to an httplib connection
         self.connections = {}
         # The location of the cache, for now a directory
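
The classes above are what the 2007-08-18 changelog entry is about: connections can now be routed through a SOCKS or HTTP proxy and carry a per-connection timeout. A hypothetical usage sketch (host and port are illustrative; socks is the SocksiPy module imported at the top of the file):

    import httplib2, socks

    proxy = httplib2.ProxyInfo(proxy_type=socks.PROXY_TYPE_SOCKS5,
                               proxy_host='127.0.0.1',
                               proxy_port=1080)
    h = httplib2.Http(cache='.cache', timeout=10, proxy_info=proxy)
    response, content = h.request('http://example.com/')

If socks is not installed, ProxyInfo.isgood() returns a false value and the connection classes fall back to a plain socket.
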
@@ -622,45 +792,72 @@ class Http:
         else:
             self.cache = cache

-        # tuples of name, password
-        self.credentials = []
+        # Name/password
+        self.credentials = Credentials()
+
+        # Key/cert
+        self.certificates = KeyCerts()

         # authorization objects
         self.authorizations = []

+        # If set to False then no redirects are followed, even safe ones.
+        self.follow_redirects = True
+
+        # Which HTTP methods do we apply optimistic concurrency to, i.e.
+        # which methods get an "if-match:" etag header added to them.
+        self.optimistic_concurrency_methods = ["PUT"]
+
+        # If 'follow_redirects' is True, and this is set to True then
+        # all redirects are followed, including unsafe ones.
         self.follow_all_redirects = False

         self.ignore_etag = False

+        self.force_exception_to_status_code = False
+
+        self.timeout = timeout
+
     def _auth_from_challenge(self, host, request_uri, headers, response, content):
         """A generator that creates Authorization objects
            that can be applied to requests.
         """
         challenges = _parse_www_authenticate(response, 'www-authenticate')
-        for cred in self.credentials:
+        for cred in self.credentials.iter(host):
             for scheme in AUTH_SCHEME_ORDER:
                 if challenges.has_key(scheme):
-                    yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)
+                    yield AUTH_SCHEME_CLASSES[scheme](cred, host, request_uri, headers, response, content, self)

-    def add_credentials(self, name, password):
+    def add_credentials(self, name, password, domain=""):
         """Add a name and password that will be used
         any time a request requires authentication."""
-        self.credentials.append((name, password))
+        self.credentials.add(name, password, domain)
+
+    def add_certificate(self, key, cert, domain):
+        """Add a key and cert that will be used
+        any time a request requires authentication."""
+        self.certificates.add(key, cert, domain)

     def clear_credentials(self):
         """Remove all the names and passwords
         that are used for authentication"""
-        self.credentials = []
+        self.credentials.clear()
         self.authorizations = []

     def _conn_request(self, conn, request_uri, method, body, headers):
         for i in range(2):
             try:
                 conn.request(method, request_uri, body, headers)
+            except socket.gaierror:
+                conn.close()
+                raise ServerNotFoundError("Unable to find the server at %s" % conn.host)
+            except (socket.error, httplib.HTTPException):
+                # Just because the server closed the connection doesn't apparently mean
+                # that the server didn't send a response.
+                pass
+            try:
                 response = conn.getresponse()
-            except gaierror:
-                raise ServerNotFoundError("Unable to find the server at %s" % request_uri)
-            except:
+            except (socket.error, httplib.HTTPException):
                 if i == 0:
                     conn.close()
                     conn.connect()
@@ -668,11 +865,13 @@ class Http:
                     continue
                 else:
                     raise
             else:
-                content = response.read()
+                content = ""
+                if method != "HEAD":
+                    content = response.read()
                 response = Response(response)
-                content = _decompressContent(response, content)
-
-            break;
+                if method != "HEAD":
+                    content = _decompressContent(response, content)
+            break

         return (response, content)
@@ -702,13 +901,13 @@ class Http:
                     authorization.response(response, body)
                     break

-        if (self.follow_all_redirects or method in ["GET", "HEAD"]) or response.status == 303:
-            if response.status in [300, 301, 302, 303, 307]:
+        if (self.follow_all_redirects or (method in ["GET", "HEAD"]) or response.status == 303):
+            if self.follow_redirects and response.status in [300, 301, 302, 303, 307]:
                 # Pick out the location header and basically start from the beginning
                 # remembering first to strip the ETag header and decrement our 'depth'
                 if redirections:
                     if not response.has_key('location') and response.status != 300:
-                        raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."))
+                        raise RedirectMissingLocation( _("Redirected but the response is missing a Location: header."), response, content)
                     # Fix-up relative redirects (which violate an RFC 2616 MUST)
                     if response.has_key('location'):
                         location = response['location']
@@ -733,7 +932,7 @@ class Http:
                     (response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
                     response.previous = old_response
                 else:
-                    raise RedirectLimit( _("Redirected more times than rediection_limit allows."))
+                    raise RedirectLimit( _("Redirected more times than redirection_limit allows."), response, content)
         elif response.status in [200, 203] and method == "GET":
             # Don't cache 206's since we aren't going to handle byte range requests
             if not response.has_key('content-location'):
@@ -742,7 +941,13 @@ class Http:

         return (response, content)

-    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS):
+
+# Need to catch and rebrand some exceptions
+# Then need to optionally turn all exceptions into status codes
+# including all socket.* and httplib.* exceptions.
+
+
+    def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
         """ Performs a single HTTP request.
 The 'uri' is the URI of the HTTP resource and can begin
 with either 'http' or 'https'. The value of 'uri' must be an absolute URI.
@@ -763,115 +968,164 @@ The return value is a tuple of (response, content), the first being
 and instance of the 'Response' class, the second being a string that
 contains the response entity body.
         """
-        if headers is None:
-            headers = {}
-        else:
-            headers = _normalize_headers(headers)
-
-        if not headers.has_key('user-agent'):
-            headers['user-agent'] = "Python-httplib2/%s" % __version__
-
-        uri = iri2uri(uri)
-
-        (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
-
-        if not self.connections.has_key(scheme+":"+authority):
-            connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
-            conn = self.connections[scheme+":"+authority] = connection_type(authority)
-            conn.set_debuglevel(debuglevel)
-        else:
-            conn = self.connections[scheme+":"+authority]
-
-        if method in ["GET", "HEAD"] and 'range' not in headers:
-            headers['accept-encoding'] = 'compress, gzip'
-
-        info = email.Message.Message()
-        cached_value = None
-        if self.cache:
-            cachekey = defrag_uri
-            cached_value = self.cache.get(cachekey)
-            if cached_value:
-                try:
-                    info = email.message_from_string(cached_value)
-                    content = cached_value.split('\r\n\r\n', 1)[1]
-                except Exception, e:
-                    self.cache.delete(cachekey)
-                    cachekey = None
-                    cached_value = None
-        else:
-            cachekey = None
-
-        if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
-            # http://www.w3.org/1999/04/Editing/
-            headers['if-match'] = info['etag']
-
-        if method not in ["GET", "HEAD"] and self.cache and cachekey:
-            # RFC 2616 Section 13.10
-            self.cache.delete(cachekey)
-
-        if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
-            if info.has_key('-x-permanent-redirect-url'):
-                # Should cached permanent redirects be counted in our redirection count? For now, yes.
-                (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
-                response.previous = Response(info)
-                response.previous.fromcache = True
+        try:
+            if headers is None:
+                headers = {}
             else:
-                # Determine our course of action:
-                # Is the cached entry fresh or stale?
-                # Has the client requested a non-cached response?
-                #
-                # There seems to be three possible answers:
-                # 1. [FRESH] Return the cache entry w/o doing a GET
-                # 2. [STALE] Do the GET (but add in cache validators if available)
-                # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
-                entry_disposition = _entry_disposition(info, headers)
-
-                if entry_disposition == "FRESH":
-                    if not cached_value:
-                        info['status'] = '504'
-                        content = ""
-                    response = Response(info)
-                    if cached_value:
-                        response.fromcache = True
-                    return (response, content)
-
-                if entry_disposition == "STALE":
-                    if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
-                        headers['if-none-match'] = info['etag']
-                    if info.has_key('last-modified') and not 'last-modified' in headers:
-                        headers['if-modified-since'] = info['last-modified']
-                elif entry_disposition == "TRANSPARENT":
-                    pass
+                headers = _normalize_headers(headers)

+            if not headers.has_key('user-agent'):
+                headers['user-agent'] = "Python-httplib2/%s" % __version__

-        (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
+            uri = iri2uri(uri)

-        if response.status == 304 and method == "GET":
-            # Rewrite the cache entry with the new end-to-end headers
-            # Take all headers that are in response
-            # and overwrite their values in info.
-            # unless they are hop-by-hop, or are listed in the connection header.
+            (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
+            domain_port = authority.split(":")[0:2]
+            if len(domain_port) == 2 and domain_port[1] == '443' and scheme == 'http':
+                scheme = 'https'
+                authority = domain_port[0]

-            for key in _get_end2end_headers(response):
-                info[key] = response[key]
-            merged_response = Response(info)
-            if hasattr(response, "_stale_digest"):
-                merged_response._stale_digest = response._stale_digest
-            try:
-                _updateCache(headers, merged_response, content, self.cache, cachekey)
-            except:
-                print locals()
-                raise
-            response = merged_response
-            response.status = 200
-            response.fromcache = True
-
-        elif response.status == 200:
-            content = new_content
+            conn_key = scheme+":"+authority
+            if conn_key in self.connections:
+                conn = self.connections[conn_key]
             else:
-            (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
-
-        return (response, content)
+                if not connection_type:
+                    connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
+                certs = list(self.certificates.iter(authority))
+                if scheme == 'https' and certs:
+                    conn = self.connections[conn_key] = connection_type(authority, key_file=certs[0][0],
+                        cert_file=certs[0][1], timeout=self.timeout, proxy_info=self.proxy_info)
+                else:
+                    conn = self.connections[conn_key] = connection_type(authority, timeout=self.timeout, proxy_info=self.proxy_info)
+                conn.set_debuglevel(debuglevel)
+
+            if method in ["GET", "HEAD"] and 'range' not in headers and 'accept-encoding' not in headers:
+                headers['accept-encoding'] = 'deflate, gzip'
+
+            info = email.Message.Message()
+            cached_value = None
+            if self.cache:
+                cachekey = defrag_uri
+                cached_value = self.cache.get(cachekey)
+                if cached_value:
+                    # info = email.message_from_string(cached_value)
+                    #
+                    # Need to replace the line above with the kludge below
+                    # to fix the non-existent bug not fixed in this
+                    # bug report: http://mail.python.org/pipermail/python-bugs-list/2005-September/030289.html
+                    try:
+                        info, content = cached_value.split('\r\n\r\n', 1)
+                        feedparser = email.FeedParser.FeedParser()
+                        feedparser.feed(info)
+                        info = feedparser.close()
+                        feedparser._parse = None
+                    except IndexError:
+                        self.cache.delete(cachekey)
+                        cachekey = None
+                        cached_value = None
+            else:
+                cachekey = None
+
+            if method in self.optimistic_concurrency_methods and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
+                # http://www.w3.org/1999/04/Editing/
+                headers['if-match'] = info['etag']
+
+            if method not in ["GET", "HEAD"] and self.cache and cachekey:
+                # RFC 2616 Section 13.10
+                self.cache.delete(cachekey)
+
+            if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
+                if info.has_key('-x-permanent-redirect-url'):
+                    # Should cached permanent redirects be counted in our redirection count? For now, yes.
+                    (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
+                    response.previous = Response(info)
+                    response.previous.fromcache = True
+                else:
+                    # Determine our course of action:
+                    # Is the cached entry fresh or stale?
+                    # Has the client requested a non-cached response?
+                    #
+                    # There seem to be three possible answers:
+                    # 1. [FRESH] Return the cache entry w/o doing a GET
+                    # 2. [STALE] Do the GET (but add in cache validators if available)
+                    # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
+                    entry_disposition = _entry_disposition(info, headers)
+
+                    if entry_disposition == "FRESH":
+                        if not cached_value:
+                            info['status'] = '504'
+                            content = ""
+                        response = Response(info)
+                        if cached_value:
+                            response.fromcache = True
+                        return (response, content)
+
+                    if entry_disposition == "STALE":
+                        if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
+                            headers['if-none-match'] = info['etag']
+                        if info.has_key('last-modified') and not 'last-modified' in headers:
+                            headers['if-modified-since'] = info['last-modified']
+                    elif entry_disposition == "TRANSPARENT":
+                        pass
+
+                    (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
+
+                if response.status == 304 and method == "GET":
+                    # Rewrite the cache entry with the new end-to-end headers
+                    # Take all headers that are in response
+                    # and overwrite their values in info.
+                    # unless they are hop-by-hop, or are listed in the connection header.
+
+                    for key in _get_end2end_headers(response):
+                        info[key] = response[key]
+                    merged_response = Response(info)
+                    if hasattr(response, "_stale_digest"):
+                        merged_response._stale_digest = response._stale_digest
+                    _updateCache(headers, merged_response, content, self.cache, cachekey)
+                    response = merged_response
+                    response.status = 200
+                    response.fromcache = True
+
+                elif response.status == 200:
+                    content = new_content
+                else:
+                    self.cache.delete(cachekey)
+                    content = new_content
+            else:
+                cc = _parse_cache_control(headers)
+                if cc.has_key('only-if-cached'):
+                    info['status'] = '504'
+                    response = Response(info)
+                    content = ""
+                else:
+                    (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
+        except Exception, e:
+            if self.force_exception_to_status_code:
+                if isinstance(e, HttpLib2ErrorWithResponse):
+                    response = e.response
+                    content = e.content
+                    response.status = 500
+                    response.reason = str(e)
+                elif isinstance(e, socket.timeout):
+                    content = "Request Timeout"
+                    response = Response( {
+                            "content-type": "text/plain",
+                            "status": "408",
+                            "content-length": len(content)
+                            })
+                    response.reason = "Request Timeout"
+                else:
+                    content = str(e)
+                    response = Response( {
+                            "content-type": "text/plain",
+                            "status": "400",
+                            "content-length": len(content)
+                            })
+                    response.reason = "Bad Request"
+            else:
+                raise
+
+        return (response, content)
@@ -898,7 +1152,7 @@ class Response(dict):
         # an httplib.HTTPResponse object.
         if isinstance(info, httplib.HTTPResponse):
             for key, value in info.getheaders():
-                self[key] = value
+                self[key.lower()] = value
             self.status = info.status
             self['status'] = str(self.status)
             self.reason = info.reason
@@ -907,11 +1161,14 @@ class Response(dict):
             for key, value in info.items():
                 self[key] = value
             self.status = int(self['status'])
+        else:
+            for key, value in info.iteritems():
+                self[key] = value
+            self.status = int(self.get('status', self.status))
+
     def __getattr__(self, name):
         if name == 'dict':
             return self
         else:
             raise AttributeError, name
-
-
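
One behavioral note on the request() rewrite above: with the new force_exception_to_status_code flag, failures can be reported as synthetic Response objects instead of raised exceptions. A sketch of what that looks like (the unresolvable host name is made up):

    import httplib2

    h = httplib2.Http(timeout=5)
    h.force_exception_to_status_code = True

    # Instead of raising ServerNotFoundError, the error comes back
    # as a fake response; per the except clause above, errors that
    # are not timeouts and carry no response map to status 400.
    response, content = h.request('http://no-such-host.invalid/')
    print response.status, response.reason
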