Don't choke on RSS feeds containing source elements; provide better messages
if there is a fetch failure and no cached version
This commit is contained in:
parent
d871cebbb5
commit
e74cd4d6cb
@ -52,91 +52,3 @@ def setTimeout(timeout):
|
||||
logger.info("Socket timeout set to %d seconds", timeout)
|
||||
else:
|
||||
logger.error("Unable to set timeout to %d seconds", timeout)
|
||||
|
||||
def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    """Fetch the reading list named by `list` and fold it into `orig_config`.

    The options recorded for section `list` in `orig_config` (plus, when
    `use_cache` is set, those found in the list's cache file) seed a fresh
    ConfigParser.  The list is then requested -- conditionally, when an
    etag or last-modified option is known -- and, if `callback` is given,
    `callback(data, cached_config)` is called with the response body
    wrapped in a StringIO.  With `use_cache` the resulting configuration
    is written to the cache file; with `re_read` it is read back into
    `orig_config`.

    Errors are swallowed: when any step fails, the cached (or partially
    built) configuration is re-read instead, and a failure of that second
    attempt is only logged.
    """
    global logger
    try:
        import urllib2
        import StringIO
        from planet.spider import filename

        # where this list's cached settings live
        cache_filename = filename(config.cache_lists_directory(), list)

        # start from the options already configured for this list
        options = {}
        for name in orig_config.options(list):
            options[name] = orig_config.get(list, name)

        # layer cached options (e.g., etag, last-modified) on top;
        # a missing or unreadable cache is simply ignored
        try:
            if use_cache:
                cached_config = ConfigParser()
                cached_config.read(cache_filename)
                for name in cached_config.options(list):
                    options[name] = cached_config.get(list, name)
        except:
            pass

        # the configuration that will be handed to the callback and cached
        cached_config = ConfigParser()
        cached_config.add_section(list)
        for name in options:
            cached_config.set(list, name, options[name])

        # resolve the list location relative to the current directory
        curdir = getattr(os.path, 'curdir', '.')
        if sys.platform.find('win') >= 0:
            path = os.path.abspath(os.path.curdir)
            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))
        else:
            base = urljoin('file:', os.path.abspath(curdir))

        # conditional request using whatever validators are known
        request = urllib2.Request(urljoin(base + '/', list))
        for option, header in (('etag', 'If-None-Match'),
                               ('last-modified', 'If-Modified-Since')):
            if options.has_key(option):
                request.add_header(header, options[option])
        response = urllib2.urlopen(request)

        # remember the validators returned with this response
        for header in ('etag', 'last-modified'):
            if response.headers.has_key(header):
                cached_config.set(list, header, response.headers[header])

        # let the callback convert the payload into config entries
        data = StringIO.StringIO(response.read())
        if callback:
            callback(data, cached_config)

        # persist the result
        if use_cache:
            cache = open(cache_filename, 'w')
            cached_config.write(cache)
            cache.close()

        # merge the result back into the caller's configuration
        logger.debug("Using %s readinglist", list)
        if re_read and use_cache:
            orig_config.read(cache_filename)
        elif re_read:
            snapshot = StringIO.StringIO()
            cached_config.write(snapshot)
            snapshot.seek(0)
            orig_config.readfp(snapshot)
    except:
        # something above failed: fall back to what is already on hand
        try:
            if re_read:
                if not use_cache:
                    snapshot = StringIO.StringIO()
                    cached_config.write(snapshot)
                    snapshot.seek(0)
                    orig_config.readfp(snapshot)
                else:
                    orig_config.read(cache_filename)
                logger.info("Using cached %s readinglist", list)
        except:
            logger.exception("Unable to read %s readinglist", list)
|
||||
|
||||
|
@ -182,7 +182,96 @@ def load(config_file):
|
||||
raise Exception
|
||||
|
||||
for list in reading_lists:
|
||||
planet.downloadReadingList(list, parser, data2config)
|
||||
downloadReadingList(list, parser, data2config)
|
||||
|
||||
def _reread_reading_list(orig_config, cached_config, cache_filename,
                         use_cache, require_cache=False):
    """Merge a reading list's settings back into `orig_config`.

    With `use_cache` the settings come from the file `cache_filename`;
    otherwise they come from the in-memory `cached_config`.  When
    `require_cache` is set, an exception is raised if there is nothing to
    read, instead of silently leaving `orig_config` untouched.
    """
    import StringIO

    if use_cache:
        # ConfigParser.read() skips files it cannot open and returns the
        # list of files it actually parsed, so an empty result means the
        # cache file was not read.
        parsed = cache_filename is not None and orig_config.read(cache_filename)
        if require_cache and not parsed:
            raise Exception("no cached copy available at %s" % cache_filename)
    else:
        if cached_config is None:
            raise Exception("reading list configuration was never built")
        cdata = StringIO.StringIO()
        cached_config.write(cdata)
        cdata.seek(0)
        orig_config.readfp(cdata)


def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
    """Fetch the reading list named by `list` and fold it into `orig_config`.

    The options recorded for section `list` in `orig_config` (plus, when
    `use_cache` is set, those found in the list's cache file) seed a fresh
    ConfigParser.  The list is then requested -- conditionally, when an
    etag or last-modified option is known -- and, if `callback` is given,
    `callback(data, cached_config)` is called with the response body
    wrapped in a StringIO.  With `use_cache` the resulting configuration
    is written to the cache file; with `re_read` it is read back into
    `orig_config`.

    Ordinary errors are not propagated: when any step fails, the cached
    (or partially built) configuration is re-read instead, and if that is
    not possible either the failure -- including the error that triggered
    the fallback -- is logged.  KeyboardInterrupt and SystemExit are left
    alone so that the process can still be stopped.
    """
    from planet import logger
    import config

    # Bound up front so the fallback path can tell "never got that far"
    # apart from a usable value.
    cache_filename = None
    cached_config = None

    try:
        import urllib2, StringIO
        from planet.spider import filename

        # list cache file name
        cache_filename = filename(config.cache_lists_directory(), list)

        # start from the options already configured for this list
        options = {}
        for key in orig_config.options(list):
            options[key] = orig_config.get(list, key)

        # retrieve list options (e.g., etag, last-modified) from cache
        if use_cache:
            try:
                previous = ConfigParser()
                previous.read(cache_filename)
                for option in previous.options(list):
                    options[option] = previous.get(list, option)
            except Exception:
                # best effort: a missing or unreadable cache entry just
                # means the request below is unconditional
                pass

        cached_config = ConfigParser()
        cached_config.add_section(list)
        for key, value in options.items():
            cached_config.set(list, key, value)

        # read list, resolving its location relative to the current directory
        curdir = getattr(os.path, 'curdir', '.')
        path = os.path.abspath(curdir)
        # startswith, not find: 'darwin' and 'cygwin' also contain 'win'
        if sys.platform.startswith('win'):
            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))
        else:
            base = urljoin('file:', path)

        request = urllib2.Request(urljoin(base + '/', list))
        if options.has_key("etag"):
            request.add_header('If-None-Match', options['etag'])
        if options.has_key("last-modified"):
            request.add_header('If-Modified-Since',
                options['last-modified'])
        response = urllib2.urlopen(request)
        if response.headers.has_key('etag'):
            cached_config.set(list, 'etag', response.headers['etag'])
        if response.headers.has_key('last-modified'):
            cached_config.set(list, 'last-modified',
                response.headers['last-modified'])

        # convert to config.ini
        data = StringIO.StringIO(response.read())

        if callback: callback(data, cached_config)

        # write to cache; close the file even if the write fails
        if use_cache:
            cache = open(cache_filename, 'w')
            try:
                cached_config.write(cache)
            finally:
                cache.close()

        # re-parse and proceed
        logger.debug("Using %s readinglist", list)
        if re_read:
            _reread_reading_list(orig_config, cached_config, cache_filename,
                use_cache)
    except Exception:
        # keep the triggering error so it can be reported if the fallback
        # fails as well
        fetch_error = sys.exc_info()[1]
        try:
            if re_read:
                try:
                    _reread_reading_list(orig_config, cached_config,
                        cache_filename, use_cache, True)
                except Exception:
                    raise Exception("%s (original failure: %r)" %
                        (sys.exc_info()[1], fetch_error))
                logger.info("Using cached %s readinglist", list)
        except Exception:
            logger.exception("Unable to read %s readinglist", list)
|
||||
|
||||
|
||||
def cache_sources_directory():
|
||||
if parser.has_option('Planet', 'cache_sources_directory'):
|
||||
|
@ -133,8 +133,8 @@ def foaf2config(rdf, config, subject=None):
|
||||
{ 'content_type' : 'foaf',
|
||||
'depth' : str(depth - 1) })
|
||||
try:
|
||||
import planet
|
||||
planet.downloadReadingList(seeAlso, config,
|
||||
from planet.config import downloadReadingList
|
||||
downloadReadingList(seeAlso, config,
|
||||
lambda data, subconfig : friend2config(model, friend, seeAlso, subconfig, data),
|
||||
False)
|
||||
except:
|
||||
|
@ -225,7 +225,7 @@ def reconstitute(feed, entry):
|
||||
author(xentry, 'contributor', contributor)
|
||||
|
||||
xsource = xdoc.createElement('source')
|
||||
source(xsource, entry.get('source', feed.feed), bozo, feed.version)
|
||||
source(xsource, entry.get('source') or feed.feed, bozo, feed.version)
|
||||
xentry.appendChild(xsource)
|
||||
|
||||
return xdoc
|
||||
|
15
tests/data/reconstitute/rsssource.xml
Normal file
15
tests/data/reconstitute/rsssource.xml
Normal file
@ -0,0 +1,15 @@
|
||||
<!--
|
||||
Description: source element
|
||||
Expect: source.title == 'foo'
|
||||
-->
|
||||
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>foo</title>
|
||||
<item>
|
||||
<guid>http://example.com/1</guid>
|
||||
<source url="http://www.example.org">org</source>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
|
Loading…
Reference in New Issue
Block a user