Change planet:subscription to planet:source, and expand the information provided in each.
commit 9fa9fb6117 (parent 6c0e24fd00)
Sam Ruby, 2006-08-18 15:47:10 -04:00

28 changed files with 195 additions and 51 deletions
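In practice: the spliced planet feed previously carried a minimal, hand-assembled planet:subscription element per subscription; it now embeds a richer planet:source element reconstituted from the cached feed metadata. A rough before/after sketch, reconstructed from the old splice.py code and the new test fixtures in this commit:

<!-- before: built by hand in splice.py -->
<planet:subscription>
  <link rel='self' href='tests/data/spider/testfeed3.rss'/>
  <planet:name>three</planet:name>
</planet:subscription>

<!-- after: a full Atom source element, parsed back out of the cache -->
<planet:source>
  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
  <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
  <subtitle>Its just data</subtitle>
  <title>Sam Ruby</title>
  <planet:name>three</planet:name>
</planet:source>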


@@ -19,13 +19,15 @@
     <h2>Subscriptions</h2>
     <ul>
-      <xsl:for-each select="planet:subscription">
+      <xsl:for-each select="planet:source">
         <xsl:sort select="planet:name"/>
         <li>
           <a href="{atom:link[@rel='self']/@href}" title="subscribe">
             <img src="images/feed-icon-10x10.png" alt="(feed)"/>
           </a>
-          <xsl:value-of select="planet:name"/>
+          <a href="{atom:link[@rel='alternate']/@href}">
+            <xsl:value-of select="planet:name"/>
+          </a>
         </li>
       </xsl:for-each>
     </ul>
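With this template change, each subscription in the sidebar renders as two links: the feed icon pointing at the feed itself (the rel='self' link) and the subscription's name pointing at the site (the new rel='alternate' link). The emitted markup would look roughly like this (values illustrative):

<li>
  <a href="http://example.com/feed.atom" title="subscribe">
    <img src="images/feed-icon-10x10.png" alt="(feed)"/>
  </a>
  <a href="http://example.com/">Example Name</a>
</li>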


@@ -1,3 +1,5 @@
+xmlns = 'http://planet.intertwingly.net/'
+
 logger = None

 def getLogger(level):


@@ -26,7 +26,7 @@ Todo:
   * error handling (example: no planet section)
 """

-import sys
+import os, sys
 from ConfigParser import ConfigParser

 parser = ConfigParser()

@@ -83,6 +83,12 @@ def template_files():
     """ list the templates defined """
     return parser.get('Planet','template_files').split(' ')

+def cache_sources_directory():
+    if parser.has_option('Planet', 'cache_sources_directory'):
+        return parser.get('Planet', 'cache_sources_directory')
+    else:
+        return os.path.join(cache_directory(), 'sources')
+
 def feeds():
     """ list the feeds defined """
     return filter(lambda feed: feed!='Planet' and feed not in template_files(),
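The new cache_sources_directory() helper makes the location of the per-subscription feed documents configurable, defaulting to a sources/ subdirectory of the cache. A hypothetical configuration illustrating the override (the cache_sources_directory line is optional and shown only as an assumed example):

[Planet]
name = test planet
cache_directory = tests/work/spider/cache
; optional override; defaults to <cache_directory>/sources
cache_sources_directory = tests/work/spider/cache/sources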


@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
""" """
__version__ = "4.2-pre-" + "$Revision: 1.131 $"[11:16] + "-cvs" __version__ = "4.2-pre-" + "$Revision: 1.132 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification, Redistribution and use in source and binary forms, with or without modification,
@ -2379,12 +2379,16 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
_BaseHTMLProcessor.handle_data(self, text) _BaseHTMLProcessor.handle_data(self, text)
def sanitize_style(self, style): def sanitize_style(self, style):
# disallow urls
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
# gauntlet # gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
clean = [] clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue
if prop.lower() in self.acceptable_css_properties: if prop.lower() in self.acceptable_css_properties:
clean.append(prop + ': ' + value + ';') clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin','padding']: elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
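Two sanitizer hardenings land in feedparser here: url(...) tokens are blanked out of inline styles before the whitelist checks run, and properties whose value ends up empty are skipped rather than emitted as dangling declarations. A minimal sketch of the combined effect, using the same regular expressions on an assumed, illustrative style attribute:

import re

# illustrative hostile input
style = "color:red;background:url(evil.png) no-repeat"

# first pass: blank out url(...) tokens
style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# style is now "color:red;background: no-repeat"

# the new guard then drops any property whose value came out empty,
# e.g. "background" in "color:red;background:" would be skipped
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
    if not value: continue
    print prop, '=', value        # color = red, background = no-repeat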


@@ -18,6 +18,7 @@ from xml.sax.saxutils import escape
 from xml.dom import minidom
 from BeautifulSoup import BeautifulSoup
 from xml.parsers.expat import ExpatError
+import planet

 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")

@@ -141,10 +142,9 @@ def content(xentry, name, detail, bozo):
     xentry.appendChild(xcontent)

-def source(xentry, source, bozo):
+def source(xsource, source, bozo):
     """ copy source information to the entry """
-    xdoc = xentry.ownerDocument
-    xsource = xdoc.createElement('source')
+    xdoc = xsource.ownerDocument

     createTextElement(xsource, 'id', source.get('id', None))
     createTextElement(xsource, 'icon', source.get('icon', None))

@@ -164,16 +164,14 @@ def source(xentry, source, bozo):
     # propagate planet inserted information
     for key, value in source.items():
-        if key.startswith('planet:'):
-            createTextElement(xsource, key, value)
-
-    xentry.appendChild(xsource)
+        if key.startswith('planet_'):
+            createTextElement(xsource, key.replace('_',':',1), value)

 def reconstitute(feed, entry):
     """ create an entry document from a parsed feed """
     xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
     xentry=xdoc.documentElement
-    xentry.setAttribute('xmlns:planet','http://planet.intertwingly.net/')
+    xentry.setAttribute('xmlns:planet',planet.xmlns)

     id(xentry, entry)
     links(xentry, entry)

@@ -191,6 +189,8 @@ def reconstitute(feed, entry):
     for contributor in entry.get('contributors',[]):
         author(xentry, 'contributor', contributor)

-    source(xentry, entry.get('source', feed.feed), bozo)
+    xsource = xdoc.createElement('source')
+    source(xsource, entry.get('source', feed.feed), bozo)
+    xentry.appendChild(xsource)

     return xdoc
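A subtlety behind the planet_ / planet: switch: the parsed-feed dictionaries use underscore-separated keys (the spider test below asserts data.entries[0].source.planet_name), so the spider stores configuration values under keys like planet_name and source() maps only the first underscore back to a colon when writing the XML:

>>> 'planet_name'.replace('_', ':', 1)
'planet:name'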


@@ -5,8 +5,9 @@ and write each as a set of entries in a cache directory.

 # Standard library modules
 import time, calendar, re, os
+from xml.dom import minidom

 # Planet modules
-import config, feedparser, reconstitute
+import planet, config, feedparser, reconstitute

 try:
     from xml.dom.ext import PrettyPrint

@@ -40,15 +41,45 @@ def filename(directory, filename):
     return os.path.join(directory, filename)

+def write(xdoc, out):
+    """ write the document out to disk """
+    file = open(out,'w')
+    try:
+        PrettyPrint(xdoc, file)
+    except:
+        # known reasons for failure include no pretty printer installed,
+        # and absurdly high levels of markup nesting causing Python to
+        # declare infinite recursion.
+        file.seek(0)
+        file.write(xdoc.toxml('utf-8'))
+    file.close()
+    xdoc.unlink()
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
     data = feedparser.parse(feed)
-    cache = config.cache_directory()
+    if not data.feed: return

-    # capture data from the planet configuration file
+    # capture feed and data from the planet configuration file
+    if not data.feed.has_key('links'): data.feed['links'] = list()
+    for link in data.feed.links:
+        if link.rel == 'self': break
+    else:
+        data.feed.links.append(feedparser.FeedParserDict(
+            {'rel':'self', 'type':'application/atom+xml', 'href':feed}))
     for name, value in config.feed_options(feed).items():
-        data.feed['planet:'+name] = value
+        data.feed['planet_'+name] = value

+    # write the feed info to the cache
+    sources = config.cache_sources_directory()
+    if not os.path.exists(sources): os.makedirs(sources)
+    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
+      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
+    reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
+    write(xdoc, filename(sources, feed))
+
+    # write each entry to the cache
+    cache = config.cache_directory()
     for entry in data.entries:
         if not entry.has_key('id'):
             entry['id'] = reconstitute.id(None, entry)

@@ -65,24 +96,11 @@ def spiderFeed(feed):
             mtime = time.time()
             entry['updated_parsed'] = time.gmtime(mtime)

-        xml = reconstitute.reconstitute(data, entry)
-        file = open(out,'w')
-        try:
-            PrettyPrint(reconstitute.reconstitute(data, entry), file)
-        except:
-            # known reasons for failure include no pretty printer installed,
-            # and absurdly high levels of markup nesting causing Python to
-            # declare infinite recursion.
-            file.seek(0)
-            file.write(reconstitute.reconstitute(data, entry).toxml('utf-8'))
-        file.close()
+        write(reconstitute.reconstitute(data, entry), out)
         os.utime(out, (mtime, mtime))

 def spiderPlanet(configFile):
     """ Spider (fetch) an entire planet """
-    import planet
     config.load(configFile)
     log = planet.getLogger(config.log_level())
     planet.setTimeout(config.feed_timeout())
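The net effect on disk: the cache now holds one file per entry plus a sources/ subdirectory with one reconstituted feed document per subscription. Roughly (names illustrative; actual filenames are the escaped form produced by filename()):

cache/                    # config.cache_directory()
    example.com,3         # one file per cached entry
    example.com,4
    sources/              # config.cache_sources_directory()
        ...               # one feed document per subscription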


@ -1,8 +1,9 @@
""" Splice together a planet from a cache of feed entries """ """ Splice together a planet from a cache of feed entries """
import glob, os import glob, os
from xml.dom import minidom from xml.dom import minidom
import config import planet, config, feedparser, reconstitute
from reconstitute import createTextElement from reconstitute import createTextElement
from spider import filename
def splice(configFile): def splice(configFile):
""" Splice together a planet from a cache of entries """ """ Splice together a planet from a cache of entries """
@ -11,7 +12,8 @@ def splice(configFile):
log = planet.getLogger(config.log_level()) log = planet.getLogger(config.log_level())
cache = config.cache_directory() cache = config.cache_directory()
dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")] dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
if not os.path.isdir(file)]
dir.sort() dir.sort()
dir.reverse() dir.reverse()
@ -34,17 +36,14 @@ def splice(configFile):
feed.appendChild(entry.documentElement) feed.appendChild(entry.documentElement)
# insert subscription information # insert subscription information
feed.setAttribute('xmlns:planet','http://planet.intertwingly.net/') feed.setAttribute('xmlns:planet',planet.xmlns)
sources = config.cache_sources_directory()
for sub in config.feeds(): for sub in config.feeds():
name = config.feed_options(sub).get('name','') data=feedparser.parse(filename(sources,sub))
xsub = doc.createElement('planet:subscription') if not data.feed: continue
xlink = doc.createElement('link') xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
xlink.setAttribute('rel','self') xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
xlink.setAttribute('href',sub.decode('utf-8')) reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
xsub.appendChild(xlink) feed.appendChild(xdoc.documentElement)
xname = doc.createElement('planet:name')
xname.appendChild(doc.createTextNode(name.decode('utf-8')))
xsub.appendChild(xname)
feed.appendChild(xsub)
return doc return doc

spider.py (Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Main program to run just the spider portion of planet
 """

splice.py (Normal file → Executable file)

@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Main program to run just the splice portion of planet
 """


@@ -2,6 +2,9 @@
 cache_directory = tests/work/spider/cache
 template_files =

+[tests/data/spider/testfeed0.atom]
+name = not found
+
 [tests/data/spider/testfeed1b.atom]
 name = one

tests/data/splice/cache/example.com,3 (new file)

@@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>http://example.com/3</id>
+  <link href='http://example.com/3' type='text/html' rel='alternate'/>
+  <title>Earth</title>
+  <summary>the Blue Planet</summary>
+  <updated>2006-01-03T00:00:00Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>Its just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>

tests/data/splice/cache/example.com,4 (new file)

@@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>http://example.com/4</id>
+  <link href='http://example.com/4' type='text/html' rel='alternate'/>
+  <title>Mars</title>
+  <summary>the Red Planet</summary>
+  <updated>2006-08-18T18:30:50Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>Its just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>

@@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed3/1</id>
+  <link href='http://example.com/1' type='text/html' rel='alternate'/>
+  <title>Mercury</title>
+  <summary>Messenger of the Roman Gods</summary>
+  <updated>2006-01-01T00:00:00Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>Its just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>

@@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed3/2</id>
+  <link href='http://example.com/2' type='text/html' rel='alternate'/>
+  <title>Venus</title>
+  <summary>the Morning Star</summary>
+  <updated>2006-08-18T18:30:50Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>Its just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>

@@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed1</id>
+  <author>
+    <name>Sam Ruby</name>
+    <email>rubys@intertwingly.net</email>
+    <uri>http://www.intertwingly.net/blog/</uri>
+  </author>
+  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom' type='application/atom+xml' rel='self'/>
+  <link href='http://www.intertwingly.net/blog/' type='text/html' rel='alternate'/>
+  <subtitle>Its just data</subtitle>
+  <title>Sam Ruby</title>
+  <updated>2006-06-17T00:15:18Z</updated>
+  <planet:name>one</planet:name>
+</feed>

@@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed2</id>
+  <author>
+    <name>Sam Ruby</name>
+    <email>rubys@intertwingly.net</email>
+    <uri>http://www.intertwingly.net/blog/</uri>
+  </author>
+  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed2.atom' type='application/atom+xml' rel='self'/>
+  <link href='http://www.intertwingly.net/blog/' type='text/html' rel='alternate'/>
+  <subtitle>Its just data</subtitle>
+  <title>Sam Ruby</title>
+  <updated>2006-06-17T00:15:18Z</updated>
+  <planet:name>two</planet:name>
+</feed>

@@ -0,0 +1,8 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+  <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+  <subtitle>Its just data</subtitle>
+  <title>Sam Ruby</title>
+  <planet:name>three</planet:name>
+</feed>


@@ -3,9 +3,14 @@ name = test planet
 cache_directory = tests/data/splice/cache
 template_files =

+[tests/data/spider/testfeed0.atom]
+name = not found
+
 [tests/data/spider/testfeed1b.atom]
 name = one

 [tests/data/spider/testfeed2.atom]
 name = two
+
+[tests/data/spider/testfeed3.rss]
+name = three


@@ -17,6 +17,10 @@ class SpiderTest(unittest.TestCase):
         os.makedirs(workdir)

     def tearDown(self):
+        for file in glob.glob(workdir+"/sources/*"):
+            os.unlink(file)
+        if os.path.exists(workdir+"/sources"):
+            os.rmdir(workdir+"/sources")
         for file in glob.glob(workdir+"/*"):
             os.unlink(file)
         os.removedirs(workdir)

@@ -36,8 +40,8 @@ class SpiderTest(unittest.TestCase):
         spiderFeed(testfeed % '1b')
         files = glob.glob(workdir+"/*")

-        # verify that exactly four files were produced
-        self.assertEqual(4, len(files))
+        # verify that exactly four files plus one sources dir were produced
+        self.assertEqual(5, len(files))

         # verify that the file names are as expected
         self.assertTrue(workdir +

@@ -45,6 +49,7 @@ class SpiderTest(unittest.TestCase):
         # verify that the file timestamps match atom:updated
         for file in files:
+            if file.endswith('/sources'): continue
             data = feedparser.parse(file)
             self.assertTrue(data.entries[0].source.planet_name)
             self.assertEqual(os.stat(file).st_mtime,

@@ -58,8 +63,8 @@ class SpiderTest(unittest.TestCase):
         spiderPlanet(configfile)
         files = glob.glob(workdir+"/*")

-        # verify that exactly twelve files were produced
-        self.assertEqual(12, len(files))
+        # verify that exactly twelve files plus one sources dir were produced
+        self.assertEqual(13, len(files))

         # verify that the file names are as expected
         self.assertTrue(workdir +


@@ -9,9 +9,9 @@ class SpliceTest(unittest.TestCase):
     def test_splice(self):
         doc = splice(configfile)

-        self.assertEqual(8,len(doc.getElementsByTagName('entry')))
-        self.assertEqual(2,len(doc.getElementsByTagName('planet:subscription')))
-        self.assertEqual(10,len(doc.getElementsByTagName('planet:name')))
+        self.assertEqual(12,len(doc.getElementsByTagName('entry')))
+        self.assertEqual(3,len(doc.getElementsByTagName('planet:source')))
+        self.assertEqual(15,len(doc.getElementsByTagName('planet:name')))
         self.assertEqual('test planet',
             doc.getElementsByTagName('title')[0].firstChild.nodeValue)