diff --git a/examples/planet.xslt b/examples/planet.xslt
index 420b0ea..c9bc464 100644
--- a/examples/planet.xslt
+++ b/examples/planet.xslt
@@ -19,13 +19,15 @@
Subscriptions
diff --git a/planet/__init__.py b/planet/__init__.py
index 7cade8c..d66a958 100644
--- a/planet/__init__.py
+++ b/planet/__init__.py
@@ -1,3 +1,5 @@
+xmlns = 'http://planet.intertwingly.net/'
+
logger = None
def getLogger(level):
diff --git a/planet/config.py b/planet/config.py
index 855d038..2e657af 100644
--- a/planet/config.py
+++ b/planet/config.py
@@ -26,7 +26,7 @@ Todo:
* error handling (example: no planet section)
"""
-import sys
+import os, sys
from ConfigParser import ConfigParser
parser = ConfigParser()
@@ -83,6 +83,12 @@ def template_files():
""" list the templates defined """
return parser.get('Planet','template_files').split(' ')
+def cache_sources_directory():
+ if parser.has_option('Planet', 'cache_sources_directory'):
+ return parser.get('Planet', 'cache_sources_directory')
+ else:
+ return os.path.join(cache_directory(), 'sources')
+
def feeds():
""" list the feeds defined """
return filter(lambda feed: feed!='Planet' and feed not in template_files(),
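
A minimal standalone sketch (not part of the patch) of the fallback behaviour that the new cache_sources_directory() provides, using the section and option names from planet/config.py; the cache path is only an example value:

    import os
    from ConfigParser import ConfigParser   # Python 2, as used by planet/config.py

    parser = ConfigParser()
    parser.add_section('Planet')
    parser.set('Planet', 'cache_directory', 'tests/work/spider/cache')

    # no cache_sources_directory option is set, so the sources cache defaults
    # to a 'sources' subdirectory of the entry cache
    if parser.has_option('Planet', 'cache_sources_directory'):
        sources = parser.get('Planet', 'cache_sources_directory')
    else:
        sources = os.path.join(parser.get('Planet', 'cache_directory'), 'sources')

    print sources    # tests/work/spider/cache/sources
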
diff --git a/planet/feedparser.py b/planet/feedparser.py
index 3ac0fda..a99c85a 100755
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec
"""
-__version__ = "4.2-pre-" + "$Revision: 1.131 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.132 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@@ -2379,12 +2379,16 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
_BaseHTMLProcessor.handle_data(self, text)
def sanitize_style(self, style):
+ # disallow urls
+ style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+ if not value: continue
if prop.lower() in self.acceptable_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
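
A quick illustration (not part of the patch) of what the new url() stripping in sanitize_style does before the existing gauntlet regexes run; the style string below is a made-up example:

    import re

    style = 'color: red; background: url(http://example.com/evil.png) no-repeat'
    style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
    print style    # 'color: red; background:  no-repeat' -- the url() token is gone
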
diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index b8b090d..2f57726 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -18,6 +18,7 @@ from xml.sax.saxutils import escape
from xml.dom import minidom
from BeautifulSoup import BeautifulSoup
from xml.parsers.expat import ExpatError
+import planet
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -141,10 +142,9 @@ def content(xentry, name, detail, bozo):
xentry.appendChild(xcontent)
-def source(xentry, source, bozo):
+def source(xsource, source, bozo):
""" copy source information to the entry """
- xdoc = xentry.ownerDocument
- xsource = xdoc.createElement('source')
+ xdoc = xsource.ownerDocument
createTextElement(xsource, 'id', source.get('id', None))
createTextElement(xsource, 'icon', source.get('icon', None))
@@ -164,16 +164,14 @@ def source(xentry, source, bozo):
# propagate planet inserted information
for key, value in source.items():
- if key.startswith('planet:'):
- createTextElement(xsource, key, value)
-
- xentry.appendChild(xsource)
+ if key.startswith('planet_'):
+ createTextElement(xsource, key.replace('_',':',1), value)
def reconstitute(feed, entry):
""" create an entry document from a parsed feed """
xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
xentry=xdoc.documentElement
- xentry.setAttribute('xmlns:planet','http://planet.intertwingly.net/')
+ xentry.setAttribute('xmlns:planet',planet.xmlns)
id(xentry, entry)
links(xentry, entry)
@@ -191,6 +189,8 @@ def reconstitute(feed, entry):
for contributor in entry.get('contributors',[]):
author(xentry, 'contributor', contributor)
- source(xentry, entry.get('source', feed.feed), bozo)
+ xsource = xdoc.createElement('source')
+ source(xsource, entry.get('source', feed.feed), bozo)
+ xentry.appendChild(xsource)
return xdoc
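
For reference, the planet_* keys come from feedparser's namespace handling: the planet:name element of a cached source shows up as source.planet_name (see tests/test_spider.py), and the loop above turns only the first underscore back into a colon when re-emitting XML. A tiny sketch with a sample dict:

    source = {'planet_name': 'one', 'title': "It's just data"}
    for key, value in source.items():
        if key.startswith('planet_'):
            print key.replace('_', ':', 1), '=', value    # planet:name = one
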
diff --git a/planet/spider.py b/planet/spider.py
index 454ef5c..f411463 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -5,8 +5,9 @@ and write each as a set of entries in a cache directory.
# Standard library modules
import time, calendar, re, os
+from xml.dom import minidom
# Planet modules
-import config, feedparser, reconstitute
+import planet, config, feedparser, reconstitute
try:
from xml.dom.ext import PrettyPrint
@@ -40,15 +41,45 @@ def filename(directory, filename):
return os.path.join(directory, filename)
+def write(xdoc, out):
+ """ write the document out to disk """
+ file = open(out,'w')
+ try:
+ PrettyPrint(xdoc, file)
+ except:
+ # known reasons for failure include no pretty printer installed,
+ # and absurdly high levels of markup nesting causing Python to
+ # declare infinite recursion.
+ file.seek(0)
+ file.write(xdoc.toxml('utf-8'))
+ file.close()
+ xdoc.unlink()
+
def spiderFeed(feed):
""" Spider (fetch) a single feed """
data = feedparser.parse(feed)
- cache = config.cache_directory()
+ if not data.feed: return
- # capture data from the planet configuration file
+ # ensure the feed has a self link, and capture per-feed options from the planet configuration file
+ if not data.feed.has_key('links'): data.feed['links'] = list()
+ for link in data.feed.links:
+ if link.rel == 'self': break
+ else:
+ data.feed.links.append(feedparser.FeedParserDict(
+ {'rel':'self', 'type':'application/atom+xml', 'href':feed}))
for name, value in config.feed_options(feed).items():
- data.feed['planet:'+name] = value
+ data.feed['planet_'+name] = value
+ # write the feed info to the cache
+ sources = config.cache_sources_directory()
+ if not os.path.exists(sources): os.makedirs(sources)
+ xdoc=minidom.parseString('''<feed xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
+ reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
+ write(xdoc, filename(sources, feed))
+
+ # write each entry to the cache
+ cache = config.cache_directory()
for entry in data.entries:
if not entry.has_key('id'):
entry['id'] = reconstitute.id(None, entry)
@@ -65,24 +96,11 @@ def spiderFeed(feed):
mtime = time.time()
entry['updated_parsed'] = time.gmtime(mtime)
- xml = reconstitute.reconstitute(data, entry)
-
- file = open(out,'w')
- try:
- PrettyPrint(reconstitute.reconstitute(data, entry), file)
- except:
- # known reasons for failure include no pretty printer installed,
- # and absurdly high levels of markup nesting causing Python to
- # declare infinite recursion.
- file.seek(0)
- file.write(reconstitute.reconstitute(data, entry).toxml('utf-8'))
- file.close()
-
+ write(reconstitute.reconstitute(data, entry), out)
os.utime(out, (mtime, mtime))
def spiderPlanet(configFile):
""" Spider (fetch) an entire planet """
- import planet
config.load(configFile)
log = planet.getLogger(config.log_level())
planet.setTimeout(config.feed_timeout())
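
The self-link guard in spiderFeed relies on Python's for/else: the else clause runs only when the loop finishes without hitting break. A standalone sketch, with plain dicts standing in for feedparser's FeedParserDict objects:

    feed_uri = 'http://example.com/feed.atom'
    links = [{'rel': 'alternate', 'href': 'http://example.com/'}]

    for link in links:
        if link['rel'] == 'self': break
    else:
        # no break happened, so no self link was found -- add one
        links.append({'rel': 'self', 'type': 'application/atom+xml',
                      'href': feed_uri})

    print links
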
diff --git a/planet/splice.py b/planet/splice.py
index cbc5740..55f9739 100644
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -1,8 +1,9 @@
""" Splice together a planet from a cache of feed entries """
import glob, os
from xml.dom import minidom
-import config
+import planet, config, feedparser, reconstitute
from reconstitute import createTextElement
+from spider import filename
def splice(configFile):
""" Splice together a planet from a cache of entries """
@@ -11,7 +12,8 @@ def splice(configFile):
log = planet.getLogger(config.log_level())
cache = config.cache_directory()
- dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")]
+ dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
+ if not os.path.isdir(file)]
dir.sort()
dir.reverse()
@@ -34,17 +36,14 @@ def splice(configFile):
feed.appendChild(entry.documentElement)
# insert subscription information
- feed.setAttribute('xmlns:planet','http://planet.intertwingly.net/')
+ feed.setAttribute('xmlns:planet',planet.xmlns)
+ sources = config.cache_sources_directory()
for sub in config.feeds():
- name = config.feed_options(sub).get('name','')
- xsub = doc.createElement('planet:subscription')
- xlink = doc.createElement('link')
- xlink.setAttribute('rel','self')
- xlink.setAttribute('href',sub.decode('utf-8'))
- xsub.appendChild(xlink)
- xname = doc.createElement('planet:name')
- xname.appendChild(doc.createTextNode(name.decode('utf-8')))
- xsub.appendChild(xname)
- feed.appendChild(xsub)
+ data=feedparser.parse(filename(sources,sub))
+ if not data.feed: continue
+ xdoc=minidom.parseString('''<planet:source xmlns:planet="%s" xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
+ reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
+ feed.appendChild(xdoc.documentElement)
return doc
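
Because spiderFeed now writes each subscription's metadata to the sources cache as a small Atom document, splice() can recover it with feedparser instead of rebuilding planet:subscription elements from config.ini. A rough sketch of that round trip, not part of the patch (the path is hypothetical):

    import feedparser

    data = feedparser.parse('cache/sources/example.com,feed.atom')
    if data.feed:                           # missing or unparseable feeds are skipped
        print data.feed.get('title')        # e.g. It's just data
        print data.feed.get('planet_name')  # per-feed name from config.ini
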
diff --git a/spider.py b/spider.py
old mode 100644
new mode 100755
index 39b73ee..e4aab2e
--- a/spider.py
+++ b/spider.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
"""
Main program to run just the spider portion of planet
"""
diff --git a/splice.py b/splice.py
old mode 100644
new mode 100755
index e98a559..e5ed424
--- a/splice.py
+++ b/splice.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
"""
Main program to run just the splice portion of planet
"""
diff --git a/tests/data/spider/config.ini b/tests/data/spider/config.ini
index b4b2075..7b38417 100644
--- a/tests/data/spider/config.ini
+++ b/tests/data/spider/config.ini
@@ -2,6 +2,9 @@
cache_directory = tests/work/spider/cache
template_files =
+[tests/data/spider/testfeed0.atom]
+name = not found
+
[tests/data/spider/testfeed1b.atom]
name = one
diff --git a/tests/data/splice/cache/example.com,3 b/tests/data/splice/cache/example.com,3
new file mode 100644
index 0000000..df0943b
--- /dev/null
+++ b/tests/data/splice/cache/example.com,3
@@ -0,0 +1,15 @@
+
+
+ http://example.com/3
+
+ Earth
+ the Blue Planet
+ 2006-01-03T00:00:00Z
+
+
+
+ It’s just data
+ Sam Ruby
+ three
+
+
diff --git a/tests/data/splice/cache/example.com,4 b/tests/data/splice/cache/example.com,4
new file mode 100644
index 0000000..bc229ff
--- /dev/null
+++ b/tests/data/splice/cache/example.com,4
@@ -0,0 +1,15 @@
+
+
+ http://example.com/4
+
+ Mars
+ the Red Planet
+ 2006-08-18T18:30:50Z
+
+
+
+ It’s just data
+ Sam Ruby
+ three
+
+
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,1
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,1
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,2
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,2
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,3
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,3
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,4
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed1,4
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,1
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,1
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,2
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,2
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,3
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,3
diff --git a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,4
similarity index 100%
rename from tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4
rename to tests/data/splice/cache/planet.intertwingly.net,2006,testfeed2,4
diff --git a/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1
new file mode 100644
index 0000000..5ca9f26
--- /dev/null
+++ b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1
@@ -0,0 +1,15 @@
+
+
+ tag:planet.intertwingly.net,2006:testfeed3/1
+
+ Mercury
+ Messenger of the Roman Gods
+ 2006-01-01T00:00:00Z
+
+
+
+ It’s just data
+ Sam Ruby
+ three
+
+
diff --git a/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2 b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2
new file mode 100644
index 0000000..f5acd6b
--- /dev/null
+++ b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2
@@ -0,0 +1,15 @@
+
+
+ tag:planet.intertwingly.net,2006:testfeed3/2
+
+ Venus
+ the Morning Star
+ 2006-08-18T18:30:50Z
+
+
+
+ It’s just data
+ Sam Ruby
+ three
+
+
diff --git a/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom b/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom
new file mode 100644
index 0000000..8cb9e5c
--- /dev/null
+++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom
@@ -0,0 +1,15 @@
+
+
+ tag:planet.intertwingly.net,2006:testfeed1
+
+ Sam Ruby
+ rubys@intertwingly.net
+ http://www.intertwingly.net/blog/
+
+
+
+ It’s just data
+ Sam Ruby
+ 2006-06-17T00:15:18Z
+ one
+
diff --git a/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom b/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom
new file mode 100644
index 0000000..6aeb0ab
--- /dev/null
+++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom
@@ -0,0 +1,15 @@
+
+
+ tag:planet.intertwingly.net,2006:testfeed2
+
+ Sam Ruby
+ rubys@intertwingly.net
+ http://www.intertwingly.net/blog/
+
+
+
+ It’s just data
+ Sam Ruby
+ 2006-06-17T00:15:18Z
+ two
+
diff --git a/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss b/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss
new file mode 100644
index 0000000..c464236
--- /dev/null
+++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss
@@ -0,0 +1,8 @@
+
+
+
+
+ It’s just data
+ Sam Ruby
+ three
+
diff --git a/tests/data/splice/config.ini b/tests/data/splice/config.ini
index f700e73..0ba74c3 100644
--- a/tests/data/splice/config.ini
+++ b/tests/data/splice/config.ini
@@ -3,9 +3,14 @@ name = test planet
cache_directory = tests/data/splice/cache
template_files =
+[tests/data/spider/testfeed0.atom]
+name = not found
+
[tests/data/spider/testfeed1b.atom]
name = one
[tests/data/spider/testfeed2.atom]
name = two
+[tests/data/spider/testfeed3.rss]
+name = three
diff --git a/tests/test_spider.py b/tests/test_spider.py
index 3bd9284..1f945e8 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -17,6 +17,10 @@ class SpiderTest(unittest.TestCase):
os.makedirs(workdir)
def tearDown(self):
+ for file in glob.glob(workdir+"/sources/*"):
+ os.unlink(file)
+ if os.path.exists(workdir+"/sources"):
+ os.rmdir(workdir+"/sources")
for file in glob.glob(workdir+"/*"):
os.unlink(file)
os.removedirs(workdir)
@@ -36,8 +40,8 @@ class SpiderTest(unittest.TestCase):
spiderFeed(testfeed % '1b')
files = glob.glob(workdir+"/*")
- # verify that exactly four files were produced
- self.assertEqual(4, len(files))
+ # verify that exactly four files + one sources dir were produced
+ self.assertEqual(5, len(files))
# verify that the file names are as expected
self.assertTrue(workdir +
@@ -45,6 +49,7 @@ class SpiderTest(unittest.TestCase):
# verify that the file timestamps match atom:updated
for file in files:
+ if file.endswith('/sources'): continue
data = feedparser.parse(file)
self.assertTrue(data.entries[0].source.planet_name)
self.assertEqual(os.stat(file).st_mtime,
@@ -58,8 +63,8 @@ class SpiderTest(unittest.TestCase):
spiderPlanet(configfile)
files = glob.glob(workdir+"/*")
- # verify that exactly eight files were produced
- self.assertEqual(12, len(files))
+ # verify that exactly twelve files + one sources dir were produced
+ self.assertEqual(13, len(files))
# verify that the file names are as expected
self.assertTrue(workdir +
diff --git a/tests/test_splice.py b/tests/test_splice.py
index 2b1a5ee..99fb446 100644
--- a/tests/test_splice.py
+++ b/tests/test_splice.py
@@ -9,9 +9,9 @@ class SpliceTest(unittest.TestCase):
def test_splice(self):
doc = splice(configfile)
- self.assertEqual(8,len(doc.getElementsByTagName('entry')))
- self.assertEqual(2,len(doc.getElementsByTagName('planet:subscription')))
- self.assertEqual(10,len(doc.getElementsByTagName('planet:name')))
+ self.assertEqual(12,len(doc.getElementsByTagName('entry')))
+ self.assertEqual(3,len(doc.getElementsByTagName('planet:source')))
+ self.assertEqual(15,len(doc.getElementsByTagName('planet:name')))
self.assertEqual('test planet',
doc.getElementsByTagName('title')[0].firstChild.nodeValue)
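
A back-of-the-envelope check (not part of the patch) of how the new expectations follow from the test data added above:

    entries = 8 + 2 + 2          # previous cache + testfeed3,{1,2} + example.com,{3,4}  -> 12
    sources = 4 - 1              # four subscriptions in config.ini, minus testfeed0,
                                 # which has no file under cache/sources/               -> 3
    names   = entries + sources  # one planet:name inside each cached entry's source,
                                 # plus one per planet:source element                   -> 15
    print entries, sources, names
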