Reverted feedparser to HEAD, i.e. it no longer needs changes to be used with an external HTTP client. Made the changes Sam suggested for getting httplib2 and feedparser working together. Added a 'dict' attribute to httplib2.Response so that it works as feedparser expects.
This commit is contained in:
parent b58d815a0d
commit 4b9e85e4f7
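
The idea, per Sam's suggestion, is to leave feedparser untouched and hand it a file-like object that carries the extra information it already knows how to use. A minimal sketch of the approach, assuming Python 2 and a hypothetical feed URL (this illustrates the technique, it is not the committed code):

    import httplib2, feedparser
    from StringIO import StringIO

    h = httplib2.Http('.cache')                          # httplib2 handles fetching and caching
    resp, content = h.request('http://example.com/feed.xml')

    f = StringIO(content)                                # feedparser accepts any file-like object
    setattr(f, 'url', 'http://example.com/feed.xml')     # feedparser copies this into result['href']
    setattr(f, 'headers', resp)                          # Response.dict now returns the Response itself
    data = feedparser.parse(f)                           # no resp_headers= parameter needed

The spider.py hunk further down applies this same pattern inside spiderFeed().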
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """
 
-__version__ = "4.2-pre-" + "$Revision: 1.145 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification,
@@ -3254,7 +3254,7 @@ def _stripDoctype(data):
 
     return version, data, dict(replacement and safe_pattern.findall(replacement))
 
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], resp_headers=None):
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
     '''Parse a feed from a URL, file, stream, or string'''
     result = FeedParserDict()
     result['feed'] = FeedParserDict()
@@ -3263,9 +3263,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], resp_headers=None):
     result['bozo'] = 0
     if type(handlers) == types.InstanceType:
         handlers = [handlers]
-    if resp_headers:
-        f = None
-        data = url_file_stream_or_string
     try:
         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
         data = f.read()
@@ -3310,8 +3307,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], resp_headers=None):
         result['status'] = f.status
     if hasattr(f, 'headers'):
         result['headers'] = f.headers.dict
-    if resp_headers:
-        result['headers'] = resp_headers
     if hasattr(f, 'close'):
         f.close()
 
@@ -715,6 +715,7 @@ a string that contains the response entity body.
                 except:
                     self.cache.delete(cachekey)
                     cachekey = None
+                    cached_value = None
         else:
             cachekey = None
 
@@ -726,7 +727,7 @@ a string that contains the response entity body.
             # RFC 2616 Section 13.10
             self.cache.delete(cachekey)
 
-        if method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
+        if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
             if info.has_key('-x-permanent-redirect-url'):
                 # Should cached permanent redirects be counted in our redirection count? For now, yes.
                 (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET", headers = headers, redirections = redirections - 1)
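
The two httplib2 hunks above work as a pair: when a cached entry cannot be parsed, cachekey and (now) cached_value are both cleared, and the cached-response branch additionally tests cached_value, so it can no longer run against a half-initialised info object after a cache failure. A self-contained, hypothetical condensation of that control flow (none of these names are httplib2's):

    def cached_headers(cache, cachekey):
        # mimics the shape of Http.request()'s cache handling after this commit
        cached_value = cache.get(cachekey)
        info = None
        if cached_value:
            try:
                header_block = cached_value.split('\r\n\r\n', 1)[0]
                info = dict(line.split(': ', 1) for line in header_block.splitlines())
            except Exception:
                cache.delete(cachekey)
                cachekey = None
                cached_value = None        # without this reset, the branch below
                                           # could still fire with info half-built
        if cached_value:                   # the extra guard added in this commit
            return info
        return None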
@@ -825,4 +826,11 @@ class Response(dict):
                 self[key] = value
         self.status = int(self['status'])
 
+    def __getattr__(self, name):
+        if name == 'dict':
+            return self
+        else:
+            raise AttributeError, name
+
+
 
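
For context on the __getattr__ addition above: feedparser stores result['headers'] = f.headers.dict, matching the interface of the header objects urllib2 hands back. Because httplib2.Response is itself a dict of headers, answering the 'dict' attribute with self is enough. A rough illustration, assuming Response can be built from a plain dict of headers (hypothetical values):

    import httplib2

    resp = httplib2.Response({'status': '200', 'content-type': 'application/atom+xml'})
    assert resp.dict is resp                 # 'dict' now resolves to the Response itself
    print resp.dict['content-type']          # behaves like headers.dict does in feedparser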
@@ -8,6 +8,7 @@ import time, calendar, re, os, urlparse
 from xml.dom import minidom
 # Planet modules
 import planet, config, feedparser, reconstitute, shell
+from StringIO import StringIO
 
 # Regular expressions to sanitise cache filenames
 re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?')
@@ -140,7 +141,11 @@ def spiderFeed(feed, only_if_new=0, content=None, resp_headers=None):
 
     # read feed itself
     if content:
-        data = feedparser.parse(content, resp_headers=resp_headers)
+        f = StringIO(content)
+        setattr(f, 'url', feed)
+        if resp_headers:
+            setattr(f, 'headers', resp_headers)
+        data = feedparser.parse(f)
     else:
         modified = None
         try:
@@ -334,7 +339,7 @@ def spiderPlanet(only_if_new = False):
     from Queue import Queue, Empty
     from threading import Thread
     import httplib2
-    from socket import gaierror
+    from socket import gaierror, error
 
     work_queue = Queue()
     awaiting_parsing = Queue()
@@ -356,6 +361,16 @@ def spiderPlanet(only_if_new = False):
                     awaiting_parsing.put(block=True, item=(resp, content, uri))
                 except gaierror:
                     log.error("Fail to resolve server name %s via %d", uri, thread_index)
+                except error, e:
+                    log.error("HTTP Error: %s in thread-%d", str(e), thread_index)
+                except Exception, e:
+                    import sys, traceback
+                    type, value, tb = sys.exc_info()
+                    log.error('Error processing %s', uri)
+                    for line in (traceback.format_exception_only(type, value) +
+                        traceback.format_tb(tb)):
+                        log.error(line.rstrip())
+
             except Empty, e:
                 log.info("Thread %d finished", thread_index)
                 pass
@@ -385,7 +400,7 @@ def spiderPlanet(only_if_new = False):
         except Exception, e:
             import sys, traceback
             type, value, tb = sys.exc_info()
-            log.error('Error processing %s', feed)
+            log.error('Error processing %s', uri)
             for line in (traceback.format_exception_only(type, value) +
                 traceback.format_tb(tb)):
                 log.error(line.rstrip())