diff --git a/planet/httplib2/__init__.py b/planet/httplib2/__init__.py index 3ee6b4f..69bf4ec 100644 --- a/planet/httplib2/__init__.py +++ b/planet/httplib2/__init__.py @@ -16,11 +16,13 @@ __contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)", "Xavier Verges Farrero", "Jonathan Feinberg", "Blair Zajac", - "Sam Ruby"] + "Sam Ruby", + "Louis Nyffenegger"] __license__ = "MIT" -__version__ = "$Rev: 217 $" +__version__ = "$Rev: 227 $" import re +import sys import md5 import email import email.Utils @@ -41,6 +43,12 @@ import hmac from gettext import gettext as _ from socket import gaierror +if sys.version_info >= (2,3): + from iri2uri import iri2uri +else: + def iri2uri(uri): + return uri + __all__ = ['Http', 'Response', 'HttpLib2Error', 'RedirectMissingLocation', 'RedirectLimit', 'FailedToDecompressContent', 'UnimplementedDigestAuthOptionError', 'UnimplementedHmacDigestAuthOptionError', @@ -51,7 +59,7 @@ __all__ = ['Http', 'Response', 'HttpLib2Error', debuglevel = 0 # Python 2.3 support -if 'sorted' not in __builtins__: +if sys.version_info < (2,4): def sorted(seq): seq.sort() return seq @@ -60,7 +68,6 @@ if 'sorted' not in __builtins__: def HTTPResponse__getheaders(self): """Return list of (header, value) tuples.""" if self.msg is None: - print "================================" raise httplib.ResponseNotReady() return self.msg.items() @@ -75,6 +82,8 @@ class RedirectLimit(HttpLib2Error): pass class FailedToDecompressContent(HttpLib2Error): pass class UnimplementedDigestAuthOptionError(HttpLib2Error): pass class UnimplementedHmacDigestAuthOptionError(HttpLib2Error): pass +class RelativeURIError(HttpLib2Error): pass +class ServerNotFoundError(HttpLib2Error): pass # Open Items: # ----------- @@ -118,6 +127,8 @@ def parse_uri(uri): def urlnorm(uri): (scheme, authority, path, query, fragment) = parse_uri(uri) + if not scheme or not authority: + raise RelativeURIError("Only absolute URIs are allowed. 
uri = %s" % uri) authority = authority.lower() scheme = scheme.lower() if not path: @@ -125,6 +136,7 @@ def urlnorm(uri): # Could do syntax based normalization of the URI before # computing the digest. See Section 6.2.2 of Std 66. request_uri = query and "?".join([path, query]) or path + scheme = scheme.lower() defrag_uri = scheme + "://" + authority + request_uri return scheme, authority, request_uri, defrag_uri @@ -143,9 +155,10 @@ def safename(filename): try: if re_url_scheme.match(filename): if isinstance(filename,str): - filename=filename.decode('utf-8').encode('idna') + filename = filename.decode('utf-8') + filename = filename.encode('idna') else: - filename=filename.encode('idna') + filename = filename.encode('idna') except: pass if isinstance(filename,unicode): @@ -260,16 +273,26 @@ def _entry_disposition(response_headers, request_headers): now = time.time() current_age = max(0, now - date) if cc_response.has_key('max-age'): - freshness_lifetime = int(cc_response['max-age']) + try: + freshness_lifetime = int(cc_response['max-age']) + except: + freshness_lifetime = 0 elif response_headers.has_key('expires'): expires = email.Utils.parsedate_tz(response_headers['expires']) freshness_lifetime = max(0, calendar.timegm(expires) - date) else: freshness_lifetime = 0 if cc.has_key('max-age'): - freshness_lifetime = min(freshness_lifetime, int(cc['max-age'])) + try: + freshness_lifetime = int(cc['max-age']) + except: + freshness_lifetime = 0 if cc.has_key('min-fresh'): - current_age += int(cc['min-fresh']) + try: + min_fresh = int(cc['min-fresh']) + except: + min_fresh = 0 + current_age += min_fresh if freshness_lifetime > current_age: retval = "FRESH" return retval @@ -418,13 +441,13 @@ class DigestAuthentication(Authentication): def response(self, response, content): if not response.has_key('authentication-info'): - challenge = _parse_www_authenticate(response, 'www-authenticate')['digest'] + challenge = _parse_www_authenticate(response, 
'www-authenticate').get('digest', {}) if 'true' == challenge.get('stale'): self.challenge['nonce'] = challenge['nonce'] self.challenge['nc'] = 1 return True else: - updated_challenge = _parse_www_authenticate(response, 'authentication-info')['digest'] + updated_challenge = _parse_www_authenticate(response, 'authentication-info').get('digest', {}) if updated_challenge.has_key('nextnonce'): self.challenge['nonce'] = updated_challenge['nextnonce'] @@ -440,7 +463,6 @@ class HmacDigestAuthentication(Authentication): Authentication.__init__(self, credentials, host, request_uri, headers, response, content, http) challenge = _parse_www_authenticate(response, 'www-authenticate') self.challenge = challenge['hmacdigest'] - print self.challenge # TODO: self.challenge['domain'] self.challenge['reason'] = self.challenge.get('reason', 'unauthorized') if self.challenge['reason'] not in ['unauthorized', 'integrity']: @@ -466,9 +488,6 @@ class HmacDigestAuthentication(Authentication): self.pwhashmod.new("".join([self.credentials[1], self.challenge['salt']])).hexdigest().lower(), ":", self.challenge['realm'] ]) - print response['www-authenticate'] - print "".join([self.credentials[1], self.challenge['salt']]) - print "key_str = %s" % self.key self.key = self.pwhashmod.new(self.key).hexdigest().lower() def request(self, method, request_uri, headers, content): @@ -479,8 +498,6 @@ class HmacDigestAuthentication(Authentication): created = time.strftime('%Y-%m-%dT%H:%M:%SZ',time.gmtime()) cnonce = _cnonce() request_digest = "%s:%s:%s:%s:%s" % (method, request_uri, cnonce, self.challenge['snonce'], headers_val) - print "key = %s" % self.key - print "msg = %s" % request_digest request_digest = hmac.new(self.key, request_digest, self.hashmod).hexdigest().lower() headers['Authorization'] = 'HMACDigest username="%s", realm="%s", snonce="%s", cnonce="%s", uri="%s", created="%s", response="%s", headers="%s"' % ( self.credentials[0], @@ -641,6 +658,8 @@ class Http: try: conn.request(method, 
request_uri, body, headers) response = conn.getresponse() + except gaierror: + raise ServerNotFoundError("Unable to find the server at %s" % request_uri) except: if i == 0: conn.close() @@ -752,6 +771,8 @@ a string that contains the response entity body. if not headers.has_key('user-agent'): headers['user-agent'] = "Python-httplib2/%s" % __version__ + uri = iri2uri(uri) + (scheme, authority, request_uri, defrag_uri) = urlnorm(uri) if not self.connections.has_key(scheme+":"+authority): @@ -780,7 +801,7 @@ a string that contains the response entity body. else: cachekey = None - if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag: + if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers: # http://www.w3.org/1999/04/Editing/ headers['if-match'] = info['etag'] @@ -815,9 +836,9 @@ a string that contains the response entity body. return (response, content) if entry_disposition == "STALE": - if info.has_key('etag') and not self.ignore_etag: + if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers: headers['if-none-match'] = info['etag'] - if info.has_key('last-modified'): + if info.has_key('last-modified') and not 'if-modified-since' in headers: headers['if-modified-since'] = info['last-modified'] elif entry_disposition == "TRANSPARENT": pass diff --git a/planet/httplib2/iri2uri.py b/planet/httplib2/iri2uri.py new file mode 100644 index 0000000..70667ed --- /dev/null +++ b/planet/httplib2/iri2uri.py @@ -0,0 +1,110 @@ +""" +iri2uri + +Converts an IRI to a URI. 
+ +""" +__author__ = "Joe Gregorio (joe@bitworking.org)" +__copyright__ = "Copyright 2006, Joe Gregorio" +__contributors__ = [] +__version__ = "1.0.0" +__license__ = "MIT" +__history__ = """ +""" + +import urlparse + + +# Convert an IRI to a URI following the rules in RFC 3987 +# +# The characters we need to encode and escape are defined in the spec: +# +# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD +# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF +# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD +# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD +# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD +# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD +# / %xD0000-DFFFD / %xE1000-EFFFD + +escape_range = [ + (0xA0, 0xD7FF ), + (0xE000, 0xF8FF ), + (0xF900, 0xFDCF ), + (0xFDF0, 0xFFEF), + (0x10000, 0x1FFFD ), + (0x20000, 0x2FFFD ), + (0x30000, 0x3FFFD), + (0x40000, 0x4FFFD ), + (0x50000, 0x5FFFD ), + (0x60000, 0x6FFFD), + (0x70000, 0x7FFFD ), + (0x80000, 0x8FFFD ), + (0x90000, 0x9FFFD), + (0xA0000, 0xAFFFD ), + (0xB0000, 0xBFFFD ), + (0xC0000, 0xCFFFD), + (0xD0000, 0xDFFFD ), + (0xE1000, 0xEFFFD), + (0xF0000, 0xFFFFD ), + (0x100000, 0x10FFFD) +] + +def encode(c): + retval = c + i = ord(c) + for low, high in escape_range: + if i < low: + break + if i >= low and i <= high: + retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')]) + break + return retval + + +def iri2uri(uri): + """Convert an IRI to a URI. Note that IRIs must be + passed in as unicode strings. That is, do not utf-8 encode + the IRI before passing it into the function.""" + if isinstance(uri ,unicode): + (scheme, authority, path, query, fragment) = urlparse.urlsplit(uri) + authority = authority.encode('idna') + # For each character in 'ucschar' or 'iprivate' + # 1. encode as utf-8 + # 2. 
then %-encode each octet of that utf-8 + uri = urlparse.urlunsplit((scheme, authority, path, query, fragment)) + uri = "".join([encode(c) for c in uri]) + return uri + +if __name__ == "__main__": + import unittest + + class Test(unittest.TestCase): + + def test_uris(self): + """Test that URIs are invariant under the transformation.""" + invariant = [ + u"ftp://ftp.is.co.za/rfc/rfc1808.txt", + u"http://www.ietf.org/rfc/rfc2396.txt", + u"ldap://[2001:db8::7]/c=GB?objectClass?one", + u"mailto:John.Doe@example.com", + u"news:comp.infosystems.www.servers.unix", + u"tel:+1-816-555-1212", + u"telnet://192.0.2.16:80/", + u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ] + for uri in invariant: + self.assertEqual(uri, iri2uri(uri)) + + def test_iri(self): + """ Test that the right type of escaping is done for each part of the URI.""" + self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}")) + self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}")) + self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}")) + self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}")) + self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")) + self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))) + self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8'))) + + unittest.main() + + diff --git a/planet/spider.py b/planet/spider.py index ab09e80..3ee8515 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -254,7 +254,6 @@ def writeCache(feed_uri, feed_info, data): def httpThread(thread_index, input_queue, output_queue, log): import httplib2, md5 - from socket import gaierror, error from httplib import BadStatusLine h = httplib2.Http(config.http_cache_directory()) @@ 
-304,13 +303,12 @@ def httpThread(thread_index, input_queue, output_queue, log): if resp.has_key('content-encoding'): del resp['content-encoding'] setattr(feed, 'headers', resp) - except gaierror: - log.error("Fail to resolve server name %s via %d", - uri, thread_index) except BadStatusLine: log.error("Bad Status Line received for %s via %d", uri, thread_index) - except error, e: + except httplib2.HttpLib2Error, e: + log.error("HttpLib2Error: %s via %d", str(e), thread_index) + except socket.error, e: if e.__class__.__name__.lower()=='timeout': feed.headers['status'] = '408' log.warn("Timeout in thread-%d", thread_index)