Change planet:subscription to planet:source, and expand the information provided in each.
2006-08-18 15:47:10 -04:00 · 2006-08-18 15:47:10 -04:00 · 9fa9fb6117
commit 9fa9fb6117
parent 6c0e24fd00
28 changed files with 195 additions and 51 deletions
--- a/examples/planet.xslt
+++ b/examples/planet.xslt
@ -19,13 +19,15 @@

          <h2>Subscriptions</h2>
          <ul>
-            <xsl:for-each select="planet:subscription">
+            <xsl:for-each select="planet:source">
              <xsl:sort select="planet:name"/>
              <li>
                <a href="{atom:link[@rel='self']/@href}" title="subscribe">
                  <img src="images/feed-icon-10x10.png" alt="(feed)"/>
                </a>
-                <xsl:value-of select="planet:name"/>
+                <a href="{atom:link[@rel='alternate']/@href}">
+                  <xsl:value-of select="planet:name"/>
+                </a>
              </li>
            </xsl:for-each>
          </ul>
--- a/planet/init.py
+++ b/planet/init.py
@ -1,3 +1,5 @@
+xmlns = 'http://planet.intertwingly.net/'
+
 logger = None

 def getLogger(level):
--- a/planet/config.py
+++ b/planet/config.py
@ -26,7 +26,7 @@ Todo:
  * error handling (example: no planet section)
 """

-import sys
+import os, sys
 from ConfigParser import ConfigParser

 parser = ConfigParser()
@ -83,6 +83,12 @@ def template_files():
    """ list the templates defined """
    return parser.get('Planet','template_files').split(' ')

+def cache_sources_directory():
+    """ directory holding cached feed sources (defaults to <cache>/sources) """
+    if parser.has_option('Planet', 'cache_sources_directory'):
+        return parser.get('Planet', 'cache_sources_directory')
+    return os.path.join(cache_directory(), 'sources')
+
 def feeds():
    """ list the feeds defined """
    return filter(lambda feed: feed!='Planet' and feed not in template_files(),
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 1.131 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.132 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@ -2379,12 +2379,16 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
+        # disallow urls
+        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
+
        # gauntlet
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
+          if not value: continue
          if prop.lower() in self.acceptable_css_properties:
              clean.append(prop + ': ' + value + ';')
          elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@ -18,6 +18,7 @@ from xml.sax.saxutils import escape
 from xml.dom import minidom
 from BeautifulSoup import BeautifulSoup
 from xml.parsers.expat import ExpatError
+import planet

 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")

@ -141,10 +142,9 @@ def content(xentry, name, detail, bozo):

    xentry.appendChild(xcontent)

-def source(xentry, source, bozo):
+def source(xsource, source, bozo):
    """ copy source information to the entry """
-    xdoc = xentry.ownerDocument
-    xsource = xdoc.createElement('source')
+    xdoc = xsource.ownerDocument

    createTextElement(xsource, 'id', source.get('id', None))
    createTextElement(xsource, 'icon', source.get('icon', None))
@ -164,16 +164,14 @@ def source(xentry, source, bozo):

    # propagate planet inserted information
    for key, value in source.items():
-        if key.startswith('planet:'):
-            createTextElement(xsource, key, value)
-
-    xentry.appendChild(xsource)
+        if key.startswith('planet_'):
+            createTextElement(xsource, key.replace('_',':',1), value)

 def reconstitute(feed, entry):
    """ create an entry document from a parsed feed """
    xdoc=minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>\n')
    xentry=xdoc.documentElement
-    xentry.setAttribute('xmlns:planet','http://planet.intertwingly.net/')
+    xentry.setAttribute('xmlns:planet',planet.xmlns)

    id(xentry, entry)
    links(xentry, entry)
@ -191,6 +189,8 @@ def reconstitute(feed, entry):
    for contributor in entry.get('contributors',[]):
        author(xentry, 'contributor', contributor)

-    source(xentry, entry.get('source', feed.feed), bozo)
+    xsource = xdoc.createElement('source')
+    source(xsource, entry.get('source', feed.feed), bozo)
+    xentry.appendChild(xsource)

    return xdoc
--- a/planet/spider.py
+++ b/planet/spider.py
@ -5,8 +5,9 @@ and write each as a set of entries in a cache directory.

 # Standard library modules
 import time, calendar, re, os
+from xml.dom import minidom
 # Planet modules
-import config, feedparser, reconstitute
+import planet, config, feedparser, reconstitute

 try:
    from xml.dom.ext import PrettyPrint
@ -40,15 +41,45 @@ def filename(directory, filename):

    return os.path.join(directory, filename)

+def write(xdoc, out):
+    """ write the document out to disk, then unlink the DOM """
+    file = open(out,'w')
+    try:
+        PrettyPrint(xdoc, file)
+    except:
+        # known failures: no pretty printer installed, or markup nested so
+        # deeply that Python declares infinite recursion; discard partial output.
+        file.seek(0)
+        file.truncate()
+        file.write(xdoc.toxml('utf-8'))
+    file.close()
+    xdoc.unlink()
+
 def spiderFeed(feed):
    """ Spider (fetch) a single feed """
    data = feedparser.parse(feed)
-    cache = config.cache_directory()
+    if not data.feed: return

-    # capture data from the planet configuration file
+    # capture feed and data from the planet configuration file
+    if not data.feed.has_key('links'): data.feed['links'] = list()
+    for link in data.feed.links:
+        if link.rel == 'self': break
+    else:
+        data.feed.links.append(feedparser.FeedParserDict(
+            {'rel':'self', 'type':'application/atom+xml', 'href':feed}))
    for name, value in config.feed_options(feed).items():
-        data.feed['planet:'+name] = value
+        data.feed['planet_'+name] = value
    
+    # write the feed info to the cache
+    sources = config.cache_sources_directory()
+    if not os.path.exists(sources): os.makedirs(sources)
+    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
+      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
+    reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
+    write(xdoc, filename(sources, feed))
+
+    # write each entry to the cache
+    cache = config.cache_directory()
    for entry in data.entries:
        if not entry.has_key('id'):
            entry['id'] = reconstitute.id(None, entry)
@ -65,24 +96,11 @@ def spiderFeed(feed):
                mtime = time.time()
            entry['updated_parsed'] = time.gmtime(mtime)

-        xml = reconstitute.reconstitute(data, entry)
-        
-        file = open(out,'w')
-        try:
-            PrettyPrint(reconstitute.reconstitute(data, entry), file)
-        except:
-            # known reasons for failure include no pretty printer installed,
-            # and absurdly high levels of markup nesting causing Python to
-            # declare infinite recursion.
-            file.seek(0)
-            file.write(reconstitute.reconstitute(data, entry).toxml('utf-8'))
-        file.close()
-
+        write(reconstitute.reconstitute(data, entry), out) 
        os.utime(out, (mtime, mtime))

 def spiderPlanet(configFile):
    """ Spider (fetch) an entire planet """
-    import planet
    config.load(configFile)
    log = planet.getLogger(config.log_level())
    planet.setTimeout(config.feed_timeout())
--- a/planet/splice.py
+++ b/planet/splice.py
@ -1,8 +1,9 @@
 """ Splice together a planet from a cache of feed entries """
 import glob, os
 from xml.dom import minidom
-import config
+import planet, config, feedparser, reconstitute
 from reconstitute import createTextElement
+from spider import filename

 def splice(configFile):
    """ Splice together a planet from a cache of entries """
@ -11,7 +12,8 @@ def splice(configFile):
    log = planet.getLogger(config.log_level())

    cache = config.cache_directory()
-    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")]
+    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
+        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

@ -34,17 +36,14 @@ def splice(configFile):
        feed.appendChild(entry.documentElement)

    # insert subscription information
-    feed.setAttribute('xmlns:planet','http://planet.intertwingly.net/')
+    feed.setAttribute('xmlns:planet',planet.xmlns)
+    sources = config.cache_sources_directory()
    for sub in config.feeds():
-        name = config.feed_options(sub).get('name','')
-        xsub = doc.createElement('planet:subscription')
-        xlink = doc.createElement('link')
-        xlink.setAttribute('rel','self')
-        xlink.setAttribute('href',sub.decode('utf-8'))
-        xsub.appendChild(xlink)
-        xname = doc.createElement('planet:name')
-        xname.appendChild(doc.createTextNode(name.decode('utf-8')))
-        xsub.appendChild(xname)
-        feed.appendChild(xsub)
+        data=feedparser.parse(filename(sources,sub))
+        if not data.feed: continue
+        xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
+             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
+        reconstitute.source(xdoc.documentElement, data.feed, data.bozo)
+        feed.appendChild(xdoc.documentElement)

    return doc
--- a/spider.py
+++ b/spider.py
@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Main program to run just the spider portion of planet
 """
--- a/splice.py
+++ b/splice.py
@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Main program to run just the splice portion of planet
 """
--- a/tests/data/spider/config.ini
+++ b/tests/data/spider/config.ini
@ -2,6 +2,9 @@
 cache_directory = tests/work/spider/cache
 template_files = 

+[tests/data/spider/testfeed0.atom]
+name = not found
+
 [tests/data/spider/testfeed1b.atom]
 name = one

--- a/tests/data/splice/cache/example.com,3
+++ b/tests/data/splice/cache/example.com,3
@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>http://example.com/3</id>
+  <link href='http://example.com/3' type='text/html' rel='alternate'/>
+  <title>Earth</title>
+  <summary>the Blue Planet</summary>
+  <updated>2006-01-03T00:00:00Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>It’s just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>
--- a/tests/data/splice/cache/example.com,4
+++ b/tests/data/splice/cache/example.com,4
@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>http://example.com/4</id>
+  <link href='http://example.com/4' type='text/html' rel='alternate'/>
+  <title>Mars</title>
+  <summary>the Red Planet</summary>
+  <updated>2006-08-18T18:30:50Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>It’s just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,1
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,2
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,3
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed1,4
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,1
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,2
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,3
--- a/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4
+++ b/tests/data/splice/cache/tag:planet.intertwingly.net,2006:testfeed2,4
--- a/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1
+++ b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,1
@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed3/1</id>
+  <link href='http://example.com/1' type='text/html' rel='alternate'/>
+  <title>Mercury</title>
+  <summary>Messenger of the Roman Gods</summary>
+  <updated>2006-01-01T00:00:00Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>It’s just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>
--- a/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2
+++ b/tests/data/splice/cache/planet.intertwingly.net,2006,testfeed3,2
@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<entry xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed3/2</id>
+  <link href='http://example.com/2' type='text/html' rel='alternate'/>
+  <title>Venus</title>
+  <summary>the Morning Star</summary>
+  <updated>2006-08-18T18:30:50Z</updated>
+  <source>
+    <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+    <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+    <subtitle>It’s just data</subtitle>
+    <title>Sam Ruby</title>
+    <planet:name>three</planet:name>
+  </source>
+</entry>
--- a/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom
+++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed1b.atom
@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed1</id>
+  <author>
+    <name>Sam Ruby</name>
+    <email>rubys@intertwingly.net</email>
+    <uri>http://www.intertwingly.net/blog/</uri>
+  </author>
+  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom' type='application/atom+xml' rel='self'/>
+  <link href='http://www.intertwingly.net/blog/' type='text/html' rel='alternate'/>
+  <subtitle>It’s just data</subtitle>
+  <title>Sam Ruby</title>
+  <updated>2006-06-17T00:15:18Z</updated>
+  <planet:name>one</planet:name>
+</feed>
--- a/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom
+++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed2.atom
@ -0,0 +1,15 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <id>tag:planet.intertwingly.net,2006:testfeed2</id>
+  <author>
+    <name>Sam Ruby</name>
+    <email>rubys@intertwingly.net</email>
+    <uri>http://www.intertwingly.net/blog/</uri>
+  </author>
+  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed2.atom' type='application/atom+xml' rel='self'/>
+  <link href='http://www.intertwingly.net/blog/' type='text/html' rel='alternate'/>
+  <subtitle>It’s just data</subtitle>
+  <title>Sam Ruby</title>
+  <updated>2006-06-17T00:15:18Z</updated>
+  <planet:name>two</planet:name>
+</feed>
--- a/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss
+++ b/tests/data/splice/cache/sources/tests,data,spider,testfeed3.rss
@ -0,0 +1,8 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:planet='http://planet.intertwingly.net/'>
+  <link href='http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss' type='text/html' rel='alternate'/>
+  <link href='tests/data/spider/testfeed3.rss' type='application/atom+xml' rel='self'/>
+  <subtitle>It’s just data</subtitle>
+  <title>Sam Ruby</title>
+  <planet:name>three</planet:name>
+</feed>
--- a/tests/data/splice/config.ini
+++ b/tests/data/splice/config.ini
@ -3,9 +3,14 @@ name = test planet
 cache_directory = tests/data/splice/cache
 template_files = 

+[tests/data/spider/testfeed0.atom]
+name = not found
+
 [tests/data/spider/testfeed1b.atom]
 name = one

 [tests/data/spider/testfeed2.atom]
 name = two

+[tests/data/spider/testfeed3.rss]
+name = three
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@ -17,6 +17,10 @@ class SpiderTest(unittest.TestCase):
             os.makedirs(workdir)
    
    def tearDown(self):
+        for file in glob.glob(workdir+"/sources/*"):
+             os.unlink(file)
+        if os.path.exists(workdir+"/sources"):
+            os.rmdir(workdir+"/sources")
        for file in glob.glob(workdir+"/*"):
             os.unlink(file)
        os.removedirs(workdir)
@ -36,8 +40,8 @@ class SpiderTest(unittest.TestCase):
        spiderFeed(testfeed % '1b')
        files = glob.glob(workdir+"/*")

-        # verify that exactly four files were produced
-        self.assertEqual(4, len(files))
+        # verify that exactly four files + one sources dir were produced
+        self.assertEqual(5, len(files))

        # verify that the file names are as expected
        self.assertTrue(workdir + 
@ -45,6 +49,7 @@ class SpiderTest(unittest.TestCase):

        # verify that the file timestamps match atom:updated
        for file in files:
+            if file.endswith('/sources'): continue
            data = feedparser.parse(file)
            self.assertTrue(data.entries[0].source.planet_name)
            self.assertEqual(os.stat(file).st_mtime,
@ -58,8 +63,8 @@ class SpiderTest(unittest.TestCase):
        spiderPlanet(configfile)
        files = glob.glob(workdir+"/*")

-        # verify that exactly eight files were produced
-        self.assertEqual(12, len(files))
+        # verify that exactly twelve files + 1 source dir were produced
+        self.assertEqual(13, len(files))

        # verify that the file names are as expected
        self.assertTrue(workdir + 
--- a/tests/test_splice.py
+++ b/tests/test_splice.py
@ -9,9 +9,9 @@ class SpliceTest(unittest.TestCase):

    def test_splice(self):
        doc = splice(configfile)
-        self.assertEqual(8,len(doc.getElementsByTagName('entry')))
-        self.assertEqual(2,len(doc.getElementsByTagName('planet:subscription')))
-        self.assertEqual(10,len(doc.getElementsByTagName('planet:name')))
+        self.assertEqual(12,len(doc.getElementsByTagName('entry')))
+        self.assertEqual(3,len(doc.getElementsByTagName('planet:source')))
+        self.assertEqual(15,len(doc.getElementsByTagName('planet:name')))

        self.assertEqual('test planet',
            doc.getElementsByTagName('title')[0].firstChild.nodeValue)