"""
|
|
Fetch either a single feed, or a set of feeds, normalize to Atom and XHTML,
|
|
and write each as a set of entries in a cache directory.
|
|
"""
|
|
|
|
# Standard library modules
|
|
import time, calendar, re, os
|
|
from xml.dom import minidom
|
|
# Planet modules
|
|
import planet, config, feedparser, reconstitute
|
|
|
|
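# PrettyPrint comes from PyXML, which may not be installed; write() below
# falls back to plain toxml() output when it is missing.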
try:
    from xml.dom.ext import PrettyPrint
except:
    PrettyPrint = None

# Regular expressions to sanitise cache filenames
re_url_scheme = re.compile(r'^\w+:/*(\w+:|www\.)?')
re_slash = re.compile(r'[?/:]+')
re_initial_cruft = re.compile(r'^[,.]*')
re_final_cruft = re.compile(r'[,.]*$')
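# Applied in sequence by filename() below, these map a feed URL such as
#   http://www.example.org/feeds/atom.xml?page=2
# to a flat cache filename such as
#   example.org,feeds,atom.xml,page=2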

def filename(directory, filename):
    """Return a filename suitable for the cache.

    Strips dangerous and common characters to create a filename we
    can use to store the cache in.
    """
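    # if the name looks like a URL, try to force it to plain ASCII via the
    # idna codec (non-ASCII characters come out as punycode); failures are
    # deliberately ignored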
    try:
        if re_url_scheme.match(filename):
            if isinstance(filename,str):
                filename=filename.decode('utf-8').encode('idna')
            else:
                filename=filename.encode('idna')
    except:
        pass
    filename = re_url_scheme.sub("", filename)
    filename = re_slash.sub(",", filename)
    filename = re_initial_cruft.sub("", filename)
    filename = re_final_cruft.sub("", filename)

    return os.path.join(directory, filename)

def write(xdoc, out):
    """ write the document out to disk """
    file = open(out,'w')
    try:
        PrettyPrint(xdoc, file)
    except:
        # known reasons for failure include no pretty printer installed,
        # and absurdly high levels of markup nesting causing Python to
        # declare infinite recursion.
        file.seek(0)
        file.write(xdoc.toxml('utf-8'))
    file.close()
    xdoc.unlink()

def spiderFeed(feed):
    """ Spider (fetch) a single feed """
    data = feedparser.parse(feed)
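    # nothing to do if feedparser could not find any feed data in the response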
|
|
if not data.feed: return
|
|
|
|
# capture feed and data from the planet configuration file
|
|
if not data.feed.has_key('links'): data.feed['links'] = list()
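    # make sure the feed carries a rel="self" link back to the URL we fetched;
    # the for/else appends one only when no existing self link is found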
    for link in data.feed.links:
        if link.rel == 'self': break
    else:
        data.feed.links.append(feedparser.FeedParserDict(
            {'rel':'self', 'type':'application/atom+xml', 'href':feed}))
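    # copy any per-feed options from the planet configuration onto the feed,
    # under planet_-prefixed keys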
    for name, value in config.feed_options(feed).items():
        data.feed['planet_'+name] = value

    # write the feed info to the cache
    sources = config.cache_sources_directory()
    if not os.path.exists(sources): os.makedirs(sources)
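    # start from an empty Atom feed skeleton (with the planet namespace
    # declared) and let reconstitute.source fill in the feed-level metadata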
    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
    write(xdoc, filename(sources, feed))

    # write each entry to the cache
    cache = config.cache_directory()
    for entry in data.entries:
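        # entries with no id get one synthesized by reconstitute; entries for
        # which no id can be synthesized at all are skipped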
        if not entry.has_key('id'):
            entry['id'] = reconstitute.id(None, entry)
            if not entry['id']: continue

        out = filename(cache, entry.id)

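        # pick a modification time for the cache file: prefer the entry's own
        # updated timestamp, then an existing cache file's mtime, and finally
        # fall back to the current time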
        if entry.has_key('updated_parsed'):
            mtime = calendar.timegm(entry.updated_parsed)
        else:
            try:
                mtime = os.stat(out).st_mtime
            except:
                mtime = time.time()
            entry['updated_parsed'] = time.gmtime(mtime)

        write(reconstitute.reconstitute(data, entry), out)
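        # stamp the cache file so its mtime matches the entry's updated time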
        os.utime(out, (mtime, mtime))

def spiderPlanet(configFile):
    """ Spider (fetch) an entire planet """
    config.load(configFile)
    log = planet.getLogger(config.log_level())
    planet.setTimeout(config.feed_timeout())

    for feed in config.feeds():
        log.info("Updating feed %s", feed)
        spiderFeed(feed)
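
# Typical use, as a sketch (assuming this module is importable as 'spider' and
# that 'config.ini' names an existing planet configuration file):
#
#   import spider
#   spider.spiderPlanet('config.ini')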