Resync with httplib2

This commit is contained in:
Sam Ruby 2006-11-19 11:56:36 -05:00
parent c337597302
commit 20cb60df7c
2 changed files with 134 additions and 65 deletions

View File

@@ -1,3 +1,4 @@
from __future__ import generators
"""
httplib2
@@ -8,21 +9,22 @@ Requires Python 2.3 or later
"""
from __future__ import generators
__author__ = "Joe Gregorio (joe@bitworking.org)"
__copyright__ = "Copyright 2006, Joe Gregorio"
__contributors__ = ["Thomas Broyer (t.broyer@ltgt.net)",
"James Antill",
"Xavier Verges Farrero",
"Jonathan Feinberg",
"Blair Zajac"]
"Blair Zajac",
"Sam Ruby"]
__license__ = "MIT"
__version__ = "$Rev: 209 $"
__version__ = "$Rev: 217 $"
import re
import md5
import rfc822
import email
import email.Utils
import email.Message
import StringIO
import gzip
import zlib
@@ -114,6 +116,49 @@ def parse_uri(uri):
groups = URI.match(uri).groups()
return (groups[1], groups[3], groups[4], groups[6], groups[8])
def urlnorm(uri):
(scheme, authority, path, query, fragment) = parse_uri(uri)
authority = authority.lower()
scheme = scheme.lower()
if not path:
path = "/"
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
defrag_uri = scheme + "://" + authority + request_uri
return scheme, authority, request_uri, defrag_uri
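A quick usage sketch of the new urlnorm helper (the URI below is illustrative, not from the commit): the scheme and authority are lowercased, an empty path becomes "/", and the fragment is dropped, yielding a stable cache key.

# Hypothetical example; relies only on urlnorm/parse_uri as defined above.
scheme, authority, request_uri, defrag_uri = urlnorm(
    "HTTP://Example.COM/feed?page=2#latest")
assert scheme == "http"
assert authority == "example.com"
assert request_uri == "/feed?page=2"
assert defrag_uri == "http://example.com/feed?page=2"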
# Cache filename construction (original borrowed from Venus http://intertwingly.net/code/venus/)
re_url_scheme = re.compile(r'^\w+://')
re_slash = re.compile(r'[?/:|]+')
def safename(filename):
"""Return a filename suitable for the cache.
Strips dangerous and common characters to create a filename we
can use to store the cache in.
"""
try:
if re_url_scheme.match(filename):
if isinstance(filename,str):
filename=filename.decode('utf-8').encode('idna')
else:
filename=filename.encode('idna')
except:
pass
if isinstance(filename,unicode):
filename=filename.encode('utf-8')
filemd5 = md5.new(filename).hexdigest()
filename = re_url_scheme.sub("", filename)
filename = re_slash.sub(",", filename)
# limit length of filename
if len(filename)>200:
filename=filename[:200]
return ",".join((filename, filemd5))
NORMALIZE_SPACE = re.compile(r'(?:\r\n)?[ \t]+')
def _normalize_headers(headers):
return dict([ (key.lower(), NORMALIZE_SPACE.sub(' ', value).strip()) for (key, value) in headers.iteritems()])
@@ -211,13 +256,13 @@ def _entry_disposition(response_headers, request_headers):
elif cc.has_key('only-if-cached'):
retval = "FRESH"
elif response_headers.has_key('date'):
date = calendar.timegm(rfc822.parsedate_tz(response_headers['date']))
date = calendar.timegm(email.Utils.parsedate_tz(response_headers['date']))
now = time.time()
current_age = max(0, now - date)
if cc_response.has_key('max-age'):
freshness_lifetime = int(cc_response['max-age'])
elif response_headers.has_key('expires'):
expires = rfc822.parsedate_tz(response_headers['expires'])
expires = email.Utils.parsedate_tz(response_headers['expires'])
freshness_lifetime = max(0, calendar.timegm(expires) - date)
else:
freshness_lifetime = 0
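A sketch of the date arithmetic above, using only stdlib calls this module already imports (the date string is illustrative): email.Utils.parsedate_tz returns a 10-tuple whose first nine fields calendar.timegm consumes.

# Hypothetical example of the freshness math in _entry_disposition.
import calendar, time
import email.Utils
date = calendar.timegm(email.Utils.parsedate_tz(
    'Sun, 19 Nov 2006 16:56:36 GMT'))
current_age = max(0, time.time() - date)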
@@ -232,12 +277,14 @@ def _entry_disposition(response_headers, request_headers):
def _decompressContent(response, new_content):
content = new_content
try:
if response.get('content-encoding', None) == 'gzip':
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
response['content-length'] = str(len(content))
if response.get('content-encoding', None) == 'deflate':
content = zlib.decompress(content)
encoding = response.get('content-encoding', None)
if encoding in ['gzip', 'deflate']:
if encoding == 'gzip':
content = gzip.GzipFile(fileobj=StringIO.StringIO(new_content)).read()
if encoding == 'deflate':
content = zlib.decompress(content)
response['content-length'] = str(len(content))
del response['content-encoding']
except:
content = ""
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'))
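A round-trip sketch of the reworked _decompressContent (the plain dict here stands in for an httplib2 Response object, and the function is assumed to return the decompressed body):

# Hypothetical example: gzip a body, then decompress it the way the
# new code path does.
import gzip, StringIO
buf = StringIO.StringIO()
gz = gzip.GzipFile(fileobj=buf, mode='wb')
gz.write('hello world')
gz.close()
response = {'content-encoding': 'gzip'}
body = _decompressContent(response, buf.getvalue())
assert body == 'hello world'
assert response['content-length'] == '11'
assert 'content-encoding' not in response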
@@ -250,14 +297,23 @@ def _updateCache(request_headers, response_headers, content, cache, cachekey):
if cc.has_key('no-store') or cc_response.has_key('no-store'):
cache.delete(cachekey)
else:
f = StringIO.StringIO("")
info = rfc822.Message(StringIO.StringIO(""))
info = email.Message.Message()
for key, value in response_headers.iteritems():
info[key] = value
f.write(str(info))
f.write("\r\n\r\n")
f.write(content)
cache.set(cachekey, f.getvalue())
if key not in ['status','content-encoding','transfer-encoding']:
info[key] = value
status = response_headers.status
if status == 304:
status = 200
status_header = 'status: %d\r\n' % status
header_str = info.as_string()
header_str = re.sub("\r(?!\n)|(?<!\r)\n", "\r\n", header_str)
text = "".join([status_header, header_str, content])
cache.set(cachekey, text)
def _cnonce():
dig = md5.new("%s:%s" % (time.ctime(), ["0123456789"[random.randrange(0, 10)] for i in range(20)])).hexdigest()
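The rewritten _updateCache above stores a synthesized status line, the CRLF-normalized header block, and the body in one string; email.message_from_string can read it straight back (values below are illustrative):

# Hand-built cache entry matching the layout _updateCache writes.
import email
cached = 'status: 200\r\ncontent-type: text/html\r\n\r\n<html/>'
info = email.message_from_string(cached)
assert info['status'] == '200'
assert cached.split('\r\n\r\n', 1)[1] == '<html/>'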
@@ -498,20 +554,23 @@ AUTH_SCHEME_CLASSES = {
AUTH_SCHEME_ORDER = ["hmacdigest", "googlelogin", "digest", "wsse", "basic"]
def _md5(s):
    return md5.new(s).hexdigest()
class FileCache:
"""Uses a local directory as a store for cached files.
Not really safe to use if multiple threads or processes are going to
be running on the same cache.
"""
def __init__(self, cache):
def __init__(self, cache, safe=safename): # use safe=lambda x: md5.new(x).hexdigest() for the old behavior
self.cache = cache
self.safe = safe
if not os.path.exists(cache):
os.makedirs(self.cache)
def get(self, key):
retval = None
cacheFullPath = os.path.join(self.cache, key)
cacheFullPath = os.path.join(self.cache, self.safe(key))
try:
f = file(cacheFullPath, "r")
retval = f.read()
@@ -521,13 +580,13 @@ class FileCache:
return retval
def set(self, key, value):
cacheFullPath = os.path.join(self.cache, key)
cacheFullPath = os.path.join(self.cache, self.safe(key))
f = file(cacheFullPath, "w")
f.write(value)
f.close()
def delete(self, key):
cacheFullPath = os.path.join(self.cache, key)
cacheFullPath = os.path.join(self.cache, self.safe(key))
if os.path.exists(cacheFullPath):
os.remove(cacheFullPath)
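Usage sketch for the updated FileCache: keys are now full URIs and safename() maps them to filesystem-safe names (the directory and values are illustrative):

cache = FileCache('.cache')
cache.set('http://example.com/feed', 'status: 200\r\n\r\nhello')
assert cache.get('http://example.com/feed').endswith('hello')
cache.delete('http://example.com/feed')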
@@ -639,7 +698,8 @@ class Http:
response['location'] = urlparse.urljoin(absolute_uri, location)
if response.status == 301 and method in ["GET", "HEAD"]:
response['-x-permanent-redirect-url'] = response['location']
response['-location'] = absolute_uri
if not response.has_key('content-location'):
response['content-location'] = absolute_uri
_updateCache(headers, response, content, self.cache, cachekey)
if headers.has_key('if-none-match'):
del headers['if-none-match']
@@ -648,7 +708,8 @@ class Http:
if response.has_key('location'):
location = response['location']
old_response = copy.deepcopy(response)
old_response['-location'] = absolute_uri
if not old_response.has_key('content-location'):
old_response['content-location'] = absolute_uri
redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
response.previous = old_response
@@ -656,7 +717,8 @@ class Http:
raise RedirectLimit( _("Redirected more times than redirection_limit allows."))
elif response.status in [200, 203] and method == "GET":
# Don't cache 206's since we aren't going to handle byte range requests
response['-location'] = absolute_uri
if not response.has_key('content-location'):
response['content-location'] = absolute_uri
_updateCache(headers, response, content, self.cache, cachekey)
return (response, content)
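A hedged usage sketch of the redirect behavior above, assuming Http accepts a cache object as its constructor argument (which the use of self.cache implies): with caching enabled, the final response records the URI actually fetched in content-location, unless the server already set one, and keeps the pre-redirect response in .previous. The URI and cache directory are illustrative.

h = Http(FileCache('.cache'))
(response, content) = h.request('http://example.com/old-feed')
if response.previous:  # a redirect was followed
    final_uri = response['content-location']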
@@ -690,14 +752,7 @@ a string that contains the response entity body.
if not headers.has_key('user-agent'):
headers['user-agent'] = "Python-httplib2/%s" % __version__
(scheme, authority, path, query, fragment) = parse_uri(uri)
authority = authority.lower()
if not path:
path = "/"
# Could do syntax based normalization of the URI before
# computing the digest. See Section 6.2.2 of Std 66.
request_uri = query and "?".join([path, query]) or path
defrag_uri = scheme + "://" + authority + request_uri
(scheme, authority, request_uri, defrag_uri) = urlnorm(uri)
if not self.connections.has_key(scheme+":"+authority):
connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
@@ -709,17 +764,16 @@ a string that contains the response entity body.
if method in ["GET", "HEAD"] and 'range' not in headers:
headers['accept-encoding'] = 'compress, gzip'
info = rfc822.Message(StringIO.StringIO(""))
info = email.Message.Message()
cached_value = None
if self.cache:
cachekey = md5.new(defrag_uri).hexdigest()
cachekey = defrag_uri
cached_value = self.cache.get(cachekey)
if cached_value:
try:
f = StringIO.StringIO(cached_value)
info = rfc822.Message(f)
info = email.message_from_string(cached_value)
content = cached_value.split('\r\n\r\n', 1)[1]
except:
except Exception, e:
self.cache.delete(cachekey)
cachekey = None
cached_value = None
@@ -802,7 +856,7 @@ a string that contains the response entity body.
class Response(dict):
"""An object more like rfc822.Message than httplib.HTTPResponse."""
"""An object more like email.Message than httplib.HTTPResponse."""
"""Is this response from our local cache"""
fromcache = False
@@ -819,7 +873,7 @@ class Response(dict):
previous = None
def __init__(self, info):
# info is either an rfc822.Message or
# info is either an email.Message or
# an httplib.HTTPResponse object.
if isinstance(info, httplib.HTTPResponse):
for key, value in info.getheaders():
@@ -828,7 +882,7 @@ class Response(dict):
self['status'] = str(self.status)
self.reason = info.reason
self.version = info.version
elif isinstance(info, rfc822.Message):
elif isinstance(info, email.Message.Message):
for key, value in info.items():
self[key] = value
self.status = int(self['status'])
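A sketch of the email.Message branch just above: a Response built from a parsed cache entry picks up its headers and an integer status (values illustrative).

import email.Message
info = email.Message.Message()
info['status'] = '304'
info['content-type'] = 'application/atom+xml'
r = Response(info)
assert r.status == 304
assert r['content-type'] == 'application/atom+xml'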

View File

@@ -140,17 +140,7 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
return
# read feed itself
if content:
# httplib2 was used to get the content, so prepare a
# proper object to pass to feedparser.
f = StringIO(content)
setattr(f, 'url', resp_headers.get('-location', feed))
if resp_headers:
if resp_headers.has_key('content-encoding'):
del resp_headers['content-encoding']
setattr(f, 'headers', resp_headers)
data = feedparser.parse(f)
else:
if not resp_headers:
modified = None
try:
modified=time.strptime(
@@ -159,12 +149,25 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
pass
data = feedparser.parse(feed_info.feed.get('planet_http_location',feed),
etag=feed_info.feed.get('planet_http_etag',None), modified=modified)
elif int(resp_headers.status) < 300:
# httplib2 was used to get the content, so prepare a
# proper object to pass to feedparser.
f = StringIO(content)
setattr(f, 'url', resp_headers.get('content-location', feed))
if resp_headers:
if resp_headers.has_key('content-encoding'):
del resp_headers['content-encoding']
setattr(f, 'headers', resp_headers)
data = feedparser.parse(f)
else:
data = feedparser.FeedParserDict({'status': int(resp_headers.status),
'headers':resp_headers, 'version':None, 'entries': []})
# capture http status
if not data.has_key("status"):
if data.has_key("entries") and len(data.entries)>0:
data.status = 200
elif data.bozo and data.bozo_exception.__class__.__name__=='Timeout':
elif data.bozo and data.bozo_exception.__class__.__name__.lower()=='timeout':
data.status = 408
else:
data.status = 500
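The lowercased class-name comparison matters because Python's socket timeout exception is named 'timeout' (all lowercase), so the old exact match on 'Timeout' never caught it:

import socket
assert socket.timeout.__name__ == 'timeout'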
@@ -380,13 +383,27 @@ def spiderPlanet(only_if_new = False):
# is empty which will terminate the thread.
uri = work_queue.get(block=False)
log.info("Fetching %s via %d", uri, thread_index)
resp = feedparser.FeedParserDict({'status':'500'})
content = None
try:
(resp, content) = h.request(uri)
awaiting_parsing.put(block=True, item=(resp, content, uri))
try:
if isinstance(uri,unicode):
idna = uri.encode('idna')
else:
idna = uri.decode('utf-8').encode('idna')
if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
except:
log.info("unable to map %s to a URI", uri)
idna = uri
(resp, content) = h.request(idna)
except gaierror:
log.error("Fail to resolve server name %s via %d", uri, thread_index)
except error, e:
log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
if e.__class__.__name__.lower()=='timeout':
resp['status'] = '408'
log.warn("Timeout in thread-%d", thread_index)
else:
log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
except Exception, e:
import sys, traceback
type, value, tb = sys.exc_info()
@@ -394,6 +411,7 @@ def spiderPlanet(only_if_new = False):
for line in (traceback.format_exception_only(type, value) +
traceback.format_tb(tb)):
log.error(line.rstrip())
awaiting_parsing.put(block=True, item=(resp, content, uri))
except Empty, e:
log.info("Thread %d finished", thread_index)
@@ -409,18 +427,15 @@ def spiderPlanet(only_if_new = False):
# Process the results as they arrive
while work_queue.qsize() or awaiting_parsing.qsize() or threads:
if awaiting_parsing.qsize() == 0 and threads:
time.sleep(1)
while awaiting_parsing.qsize() == 0 and threads:
time.sleep(0.1)
while awaiting_parsing.qsize():
item = awaiting_parsing.get(False)
try:
(resp_headers, content, uri) = item
if not resp_headers.fromcache:
if resp_headers.status < 300:
log.info("Parsing pre-fetched %s", uri)
spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers)
else:
log.error("Status code %d from %s", resp_headers.status, uri)
if resp_headers.status == 200 and resp_headers.fromcache:
resp_headers.status = 304
spiderFeed(uri, only_if_new=only_if_new, content=content, resp_headers=resp_headers)
except Exception, e:
import sys, traceback
type, value, tb = sys.exc_info()