Updates from Sam Ruby.

2006-10-12 21:24:45 -05:00 · 2006-10-12 21:24:45 -05:00 · f940ab6af4
commit f940ab6af4
parent eb1dc357e2 db79be60cc
11 changed files with 175 additions and 104 deletions
--- a/planet/__init__.py
+++ b/planet/__init__.py
@@ -52,91 +52,3 @@ def setTimeout(timeout):
                logger.info("Socket timeout set to %d seconds", timeout)
            else:
                logger.error("Unable to set timeout to %d seconds", timeout)
-
-def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
-    global logger
-    try:
-
-        import urllib2, StringIO
-        from planet.spider import filename
-
-        # list cache file name
-        cache_filename = filename(config.cache_lists_directory(), list)
-
-        # retrieve list options (e.g., etag, last-modified) from cache
-        options = {}
-
-        # add original options
-        for key in orig_config.options(list):
-            options[key] = orig_config.get(list, key)
-            
-        try:
-            if use_cache:
-                cached_config = ConfigParser()
-                cached_config.read(cache_filename)
-                for option in cached_config.options(list):
-                     options[option] = cached_config.get(list,option)
-        except:
-            pass
-
-        cached_config = ConfigParser()
-        cached_config.add_section(list)
-        for key, value in options.items():
-            cached_config.set(list, key, value)
-
-        # read list
-        curdir=getattr(os.path, 'curdir', '.')
-        if sys.platform.find('win') < 0:
-            base = urljoin('file:', os.path.abspath(curdir))
-        else:
-            path = os.path.abspath(os.path.curdir)
-            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))
-
-        request = urllib2.Request(urljoin(base + '/', list))
-        if options.has_key("etag"):
-            request.add_header('If-None-Match', options['etag'])
-        if options.has_key("last-modified"):
-            request.add_header('If-Modified-Since',
-                options['last-modified'])
-        response = urllib2.urlopen(request)
-        if response.headers.has_key('etag'):
-            cached_config.set(list, 'etag', response.headers['etag'])
-        if response.headers.has_key('last-modified'):
-            cached_config.set(list, 'last-modified',
-                response.headers['last-modified'])
-
-        # convert to config.ini
-        data = StringIO.StringIO(response.read())
-
-        if callback: callback(data, cached_config)
-
-        # write to cache
-        if use_cache:
-            cache = open(cache_filename, 'w')
-            cached_config.write(cache)
-            cache.close()
-
-        # re-parse and proceed
-        logger.debug("Using %s readinglist", list) 
-        if re_read:
-            if use_cache:  
-                orig_config.read(cache_filename)
-            else:
-                cdata = StringIO.StringIO()
-                cached_config.write(cdata)
-                cdata.seek(0)
-                orig_config.readfp(cdata)
-    except:
-        try:
-            if re_read:
-                if use_cache:  
-                    orig_config.read(cache_filename)
-                else:
-                    cdata = StringIO.StringIO()
-                    cached_config.write(cdata)
-                    cdata.seek(0)
-                    orig_config.readfp(cdata)
-                logger.info("Using cached %s readinglist", list)
-        except:
-            logger.exception("Unable to read %s readinglist", list)
-
--- a/planet/config.py
+++ b/planet/config.py
@@ -182,7 +182,96 @@ def load(config_file):
                    raise Exception

        for list in reading_lists:
-            planet.downloadReadingList(list, parser, data2config)
+            downloadReadingList(list, parser, data2config)
+
+def downloadReadingList(list, orig_config, callback, use_cache=True, re_read=True):
+    from planet import logger
+    import config
+    try:
+
+        import urllib2, StringIO
+        from planet.spider import filename
+
+        # list cache file name
+        cache_filename = filename(config.cache_lists_directory(), list)
+
+        # retrieve list options (e.g., etag, last-modified) from cache
+        options = {}
+
+        # add original options
+        for key in orig_config.options(list):
+            options[key] = orig_config.get(list, key)
+            
+        try:
+            if use_cache:
+                cached_config = ConfigParser()
+                cached_config.read(cache_filename)
+                for option in cached_config.options(list):
+                     options[option] = cached_config.get(list,option)
+        except:
+            pass
+
+        cached_config = ConfigParser()
+        cached_config.add_section(list)
+        for key, value in options.items():
+            cached_config.set(list, key, value)
+
+        # read list
+        curdir=getattr(os.path, 'curdir', '.')
+        if sys.platform.find('win') < 0:
+            base = urljoin('file:', os.path.abspath(curdir))
+        else:
+            path = os.path.abspath(os.path.curdir)
+            base = urljoin('file:///', path.replace(':','|').replace('\\','/'))
+
+        request = urllib2.Request(urljoin(base + '/', list))
+        if options.has_key("etag"):
+            request.add_header('If-None-Match', options['etag'])
+        if options.has_key("last-modified"):
+            request.add_header('If-Modified-Since',
+                options['last-modified'])
+        response = urllib2.urlopen(request)
+        if response.headers.has_key('etag'):
+            cached_config.set(list, 'etag', response.headers['etag'])
+        if response.headers.has_key('last-modified'):
+            cached_config.set(list, 'last-modified',
+                response.headers['last-modified'])
+
+        # convert to config.ini
+        data = StringIO.StringIO(response.read())
+
+        if callback: callback(data, cached_config)
+
+        # write to cache
+        if use_cache:
+            cache = open(cache_filename, 'w')
+            cached_config.write(cache)
+            cache.close()
+
+        # re-parse and proceed
+        logger.debug("Using %s readinglist", list) 
+        if re_read:
+            if use_cache:  
+                orig_config.read(cache_filename)
+            else:
+                cdata = StringIO.StringIO()
+                cached_config.write(cdata)
+                cdata.seek(0)
+                orig_config.readfp(cdata)
+    except:
+        try:
+            if re_read:
+                if use_cache:  
+                    if not orig_config.read(cache_filename): raise Exception()
+                else:
+                    cdata = StringIO.StringIO()
+                    cached_config.write(cdata)
+                    cdata.seek(0)
+                    orig_config.readfp(cdata)
+                logger.info("Using cached %s readinglist", list)
+        except:
+            logger.exception("Unable to read %s readinglist", list)
+

 def cache_sources_directory():
    if parser.has_option('Planet', 'cache_sources_directory'):
--- a/planet/feedparser.py
+++ b/planet/feedparser.py
@@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
 """

-__version__ = "4.2-pre-" + "$Revision: 1.141 $"[11:16] + "-cvs"
+__version__ = "4.2-pre-" + "$Revision: 1.142 $"[11:16] + "-cvs"
 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
@@ -2640,7 +2640,7 @@ def registerDateHandler(func):
 # 0301-04-01), so we use templates instead.
 # Please note the order in templates is significant because we need a
 # greedy match.
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
+_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY', 
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
--- a/planet/foaf.py
+++ b/planet/foaf.py
@@ -133,8 +133,8 @@ def foaf2config(rdf, config, subject=None):
                            { 'content_type' : 'foaf', 
                              'depth' : str(depth - 1) })
                try:
-                    import planet
-                    planet.downloadReadingList(seeAlso, config,
+                    from planet.config import downloadReadingList
+                    downloadReadingList(seeAlso, config,
                        lambda data, subconfig : friend2config(model, friend, seeAlso, subconfig, data), 
                        False)
                except:
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -202,6 +202,9 @@ def reconstitute(feed, entry):
    xentry=xdoc.documentElement
    xentry.setAttribute('xmlns:planet',planet.xmlns)

+    if entry.has_key('language'):
+        xentry.setAttribute('xml:lang', entry.language)
+
    id(xentry, entry)
    links(xentry, entry)

@@ -225,7 +228,7 @@ def reconstitute(feed, entry):
        author(xentry, 'contributor', contributor)

    xsource = xdoc.createElement('source')
-    source(xsource, entry.get('source', feed.feed), bozo, feed.version)
+    source(xsource, entry.get('source') or feed.feed, bozo, feed.version)
    xentry.appendChild(xsource)

    return xdoc
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -34,6 +34,16 @@ def filename(directory, filename):
    filename = re_initial_cruft.sub("", filename)
    filename = re_final_cruft.sub("", filename)

+    # limit length of filename
+    if len(filename)>250:
+        parts=filename.split(',')
+        for i in range(len(parts),0,-1):
+            if len(','.join(parts[:i])) < 220:
+                import md5
+                filename = ','.join(parts[:i]) + ',' + \
+                    md5.new(','.join(parts[i:])).hexdigest()
+                break
+  
    return os.path.join(directory, filename)

 def write(xdoc, out):
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -17,7 +17,7 @@ def splice():
    dir.sort()
    dir.reverse()

-    items=max([config.items_per_page(templ)
+    max_items=max([config.items_per_page(templ)
        for templ in config.template_files() or ['Planet']])

    doc = minidom.parseString('<feed xmlns="http://www.w3.org/2005/Atom"/>')
@@ -49,25 +49,40 @@ def splice():
        link.setAttribute('href', config.link())
        feed.appendChild(link)

-    # insert entry information
-    for mtime,file in dir[:items]:
-        try:
-            entry=minidom.parse(file)
-            feed.appendChild(entry.documentElement)
-        except:
-            log.error("Error parsing %s", file)
-
    # insert subscription information
+    sub_ids = []
    feed.setAttribute('xmlns:planet',planet.xmlns)
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
+        if data.feed.has_key('id'): sub_ids.append(data.feed.id)
        if not data.feed: continue
        xdoc=minidom.parseString('''<planet:source xmlns:planet="%s"
             xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
        reconstitute.source(xdoc.documentElement, data.feed, None, None)
        feed.appendChild(xdoc.documentElement)

+    # insert entry information
+    items = 0
+    for mtime,file in dir:
+        try:
+            entry=minidom.parse(file)
+
+            # verify that this entry is currently subscribed to
+            entry.normalize()
+            sources = entry.getElementsByTagName('source')
+            if sources:
+                ids = sources[0].getElementsByTagName('id')
+                if ids and ids[0].childNodes[0].nodeValue not in sub_ids:
+                    continue
+
+            # add entry to feed
+            feed.appendChild(entry.documentElement)
+            items = items + 1
+            if items >= max_items: break
+        except:
+            log.error("Error parsing %s", file)
+
    return doc

 def apply(doc):
--- a/runtests.py
+++ b/runtests.py
@@ -26,7 +26,12 @@ import planet
 planet.getLogger("WARNING")

 # load all of the tests into a suite
-suite = unittest.TestLoader().loadTestsFromNames(modules)
+try:
+    suite = unittest.TestLoader().loadTestsFromNames(modules)
+except Exception, exception:
+    # attempt to produce a more specific message
+    for module in modules: __import__(module)
+    raise

 # run test suite
 unittest.TextTestRunner().run(suite)
--- a/tests/data/reconstitute/dc_lang.xml
+++ b/tests/data/reconstitute/dc_lang.xml
@@ -0,0 +1,14 @@
+<!--
+Description:  title value
+Expect:       title_detail.language == 'en-us'
+-->
+
+<rdf:RDF
+  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns="http://purl.org/rss/1.0/">
+<item>
+  <title>foo</title>
+  <dc:language>en-us</dc:language>
+</item>
+</rdf:RDF>
--- a/tests/data/reconstitute/rsssource.xml
+++ b/tests/data/reconstitute/rsssource.xml
@@ -0,0 +1,15 @@
+<!--
+Description:  source element
+Expect:       source.title == 'foo' 
+-->
+
+<rss version="2.0">
+  <channel>
+    <title>foo</title>
+    <item>
+      <guid>http://example.com/1</guid>
+      <source url="http://www.example.org">org</source>
+    </item>
+  </channel>
+</rss>
+
--- a/tests/test_splice.py
+++ b/tests/test_splice.py
@@ -16,3 +16,11 @@ class SpliceTest(unittest.TestCase):

        self.assertEqual('test planet',
            doc.getElementsByTagName('title')[0].firstChild.nodeValue)
+
+    def test_splice_unsub(self):
+        config.load(configfile)
+        config.parser.remove_section('tests/data/spider/testfeed2.atom')
+        doc = splice()
+        self.assertEqual(8,len(doc.getElementsByTagName('entry')))
+        self.assertEqual(3,len(doc.getElementsByTagName('planet:source')))
+        self.assertEqual(11,len(doc.getElementsByTagName('planet:name')))