Reverted feedparser to HEAD, i.e. it no longer needs local changes to work with an external HTTP client. Made the changes Sam suggested for getting httplib2 and feedparser working together. Added a 'dict' attribute to httplib2.Response so it behaves as feedparser expects.

Joe Gregorio 2006-11-05 22:00:05 -05:00
parent b58d815a0d
commit 4b9e85e4f7
3 changed files with 29 additions and 11 deletions
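
The wiring the commit message describes looks roughly like this (a minimal sketch, not code from this commit; the URI and cache directory are made up):

    # Fetch with httplib2, then hand feedparser a file-like object that
    # carries the URL and response headers as attributes.
    from StringIO import StringIO
    import httplib2
    import feedparser

    h = httplib2.Http('.cache')                  # hypothetical cache directory
    resp, content = h.request('http://example.org/feed.atom')

    f = StringIO(content)
    f.url = 'http://example.org/feed.atom'       # feedparser copies this
    f.headers = resp                             # Response gains .dict below,
                                                 # so f.headers.dict resolves
    data = feedparser.parse(f)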

planet/feedparser.py

@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """
-__version__ = "4.2-pre-" + "$Revision: 1.145 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
@@ -3254,7 +3254,7 @@ def _stripDoctype(data):
     return version, data, dict(replacement and safe_pattern.findall(replacement))
 
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], resp_headers=None):
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
     '''Parse a feed from a URL, file, stream, or string'''
     result = FeedParserDict()
     result['feed'] = FeedParserDict()
@@ -3263,9 +3263,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
     result['bozo'] = 0
     if type(handlers) == types.InstanceType:
         handlers = [handlers]
-    if resp_headers:
-        f = None
-        data = url_file_stream_or_string
     try:
         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
         data = f.read()
@@ -3310,8 +3307,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
         result['status'] = f.status
     if hasattr(f, 'headers'):
         result['headers'] = f.headers.dict
-    if resp_headers:
-        result['headers'] = resp_headers
     if hasattr(f, 'close'):
         f.close()
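
As the last hunk shows, feedparser copies response headers from f.headers.dict. That attribute comes from urllib2: its response objects expose headers as a mimetools.Message, which keeps a plain-dict copy of the headers in .dict. A quick illustration (the URL is made up):

    import urllib2
    f = urllib2.urlopen('http://example.org/feed.atom')
    print f.headers.dict    # plain dict of response headers, e.g.
                            # {'content-type': 'application/atom+xml', ...}

httplib2.Response is a dict subclass with no such attribute, which is what the __getattr__ shim in the next file papers over.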

planet/httplib2/__init__.py

@@ -715,6 +715,7 @@ a string that contains the response entity body.
             except:
                 self.cache.delete(cachekey)
                 cachekey = None
+                cached_value = None
         else:
             cachekey = None
@@ -726,7 +727,7 @@ a string that contains the response entity body.
                 # RFC 2616 Section 13.10
                 self.cache.delete(cachekey)
 
-            if method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
+            if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
                 if info.has_key('-x-permanent-redirect-url'):
                     # Should cached permanent redirects be counted in our redirection count? For now, yes.
                     (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
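
The cached_value reset and the added guard travel together: when a cached entry fails to parse, the except clause above previously cleared cachekey but left cached_value holding the unparseable bytes, so the cached-redirect check here could still fire even though the info object it consults was never created. A condensed illustration of the trap (names mirror the code above; the cache contents are made up):

    cached_value = 'corrupt entry'      # as read back from the cache
    info = None                         # only set when parsing succeeds
    try:
        raise ValueError('stand-in for the parse failure')
    except:
        cachekey = None
        cached_value = None             # the fix: forget the bad entry

    # Without the reset, this branch would run against a never-parsed entry.
    if cached_value and info and info.has_key('-x-permanent-redirect-url'):
        pass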
@@ -825,4 +826,11 @@ class Response(dict):
                 self[key] = value
             self.status = int(self['status'])
 
+    def __getattr__(self, name):
+        if name == 'dict':
+            return self
+        else:
+            raise AttributeError, name
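
Because __getattr__ is only consulted for attributes that aren't found normally, real attributes like status keep working; asking for dict simply hands back the Response itself, which already is a dict of headers. A sketch of the effect (assuming, as the constructor context above suggests, that Response accepts a mapping with a 'status' key):

    import httplib2
    resp = httplib2.Response({'status': '200', 'content-type': 'text/xml'})
    assert resp.status == 200       # normal attribute, __getattr__ not involved
    assert resp.dict is resp        # the shim: headers.dict is the dict itself
    print resp.dict['content-type'] # text/xml

This is exactly the shape feedparser's result['headers'] = f.headers.dict expects.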

planet/spider.py

@@ -8,6 +8,7 @@ import time, calendar, re, os, urlparse
 from xml.dom import minidom
 
 # Planet modules
 import planet, config, feedparser, reconstitute, shell
+from StringIO import StringIO
 
 # Regular expressions to sanitise cache filenames
 re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?')
@@ -140,7 +141,11 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
 
     # read feed itself
     if content:
-        data = feedparser.parse(content, resp_headers=resp_headers)
+        f = StringIO(content)
+        setattr(f, 'url', feed)
+        if resp_headers:
+            setattr(f, 'headers', resp_headers)
+        data = feedparser.parse(f)
     else:
         modified = None
         try:
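
Here resp_headers is expected to be an httplib2.Response (the threaded fetcher below passes one through), so the .dict shim added above is what lets feedparser's f.headers.dict succeed on the wrapper. Roughly what parse() now receives (a sketch; the feed body and headers are made up):

    from StringIO import StringIO
    import httplib2

    f = StringIO('<feed xmlns="http://www.w3.org/2005/Atom"/>')
    f.url = 'http://example.org/feed.atom'
    f.headers = httplib2.Response({'status': '200',
                                   'content-type': 'application/atom+xml'})
    data = f.read()             # feedparser reads the body like any file
    headers = f.headers.dict    # ...and copies the headers via .dict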
@@ -334,7 +339,7 @@ def spiderPlanet(only_if_new = False):
     from Queue import Queue, Empty
     from threading import Thread
     import httplib2
-    from socket import gaierror
+    from socket import gaierror, error
 
     work_queue = Queue()
     awaiting_parsing = Queue()
@@ -356,6 +361,16 @@ def spiderPlanet(only_if_new = False):
                     awaiting_parsing.put(block=True, item=(resp, content, uri))
                 except gaierror:
                     log.error("Fail to resolve server name %s via %d", uri, thread_index)
+                except error, e:
+                    log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
+                except Exception, e:
+                    import sys, traceback
+                    type, value, tb = sys.exc_info()
+                    log.error('Error processing %s', uri)
+                    for line in (traceback.format_exception_only(type, value) +
+                        traceback.format_tb(tb)):
+                        log.error(line.rstrip())
             except Empty, e:
                 log.info("Thread %d finished", thread_index)
                 pass
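
The broad except Exception handler uses a logging idiom worth noting: format_exception_only plus format_tb return the traceback as a list of strings, so it can go through the logger one line at a time instead of landing on stderr. In isolation (a minimal, self-contained sketch):

    import sys, traceback, logging
    logging.basicConfig()
    log = logging.getLogger('spider')

    try:
        1 / 0
    except Exception:
        type, value, tb = sys.exc_info()
        for line in (traceback.format_exception_only(type, value) +
                     traceback.format_tb(tb)):
            log.error(line.rstrip())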
@@ -385,7 +400,7 @@ def spiderPlanet(only_if_new = False):
         except Exception, e:
             import sys, traceback
             type, value, tb = sys.exc_info()
-            log.error('Error processing %s', feed)
+            log.error('Error processing %s', uri)
             for line in (traceback.format_exception_only(type, value) +
                 traceback.format_tb(tb)):
                 log.error(line.rstrip())