diff --git a/THANKS b/THANKS index a4da15e..a58da71 100644 --- a/THANKS +++ b/THANKS @@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug Eric van der Vlist - Filters to add language, category information Chris Dolan - mkdir cache; default template_dirs; fix xsltproc David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/ -Morten Fredericksen - Support WordPress LinkManager OPML +Morten Frederiksen - Support WordPress LinkManager OPML Harry Fuecks - default item date to feed date Antonio Cavedoni - Django templates diff --git a/docs/config.html b/docs/config.html index b1e6550..c6fb04e 100644 --- a/docs/config.html +++ b/docs/config.html @@ -111,6 +111,10 @@ no threads are used and spidering follows the traditional algorithm. directory to be used for an additional HTTP cache to front end the Venus cache. If specified as a relative path, it is evaluated relative to the cache_directory. +
cache_keep_entries
+
Used by expunge to determine how many entries should be +kept for each source when expunging old entries from the cache directory. +This may be overridden on a per subscription feed basis.

Additional options can be found in normalization level overrides.

diff --git a/expunge.py b/expunge.py new file mode 100644 index 0000000..ff5017a --- /dev/null +++ b/expunge.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +""" +Main program to run just the expunge portion of planet +""" + +import os.path +import sys +from planet import expunge, config + +if __name__ == '__main__': + + if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]): + config.load(sys.argv[1]) + expunge.expungeCache() + else: + print "Usage:" + print " python %s config.ini" % sys.argv[0] diff --git a/planet.py b/planet.py index a285f6c..c278c06 100755 --- a/planet.py +++ b/planet.py @@ -21,6 +21,7 @@ if __name__ == "__main__": offline = 0 verbose = 0 only_if_new = 0 + expunge = 0 for arg in sys.argv[1:]: if arg == "-h" or arg == "--help": @@ -31,6 +32,7 @@ if __name__ == "__main__": print " -o, --offline Update the Planet from the cache only" print " -h, --help Display this help message and exit" print " -n, --only-if-new Only spider new feeds" + print " -x, --expunge Expunge old entries from cache" print sys.exit(0) elif arg == "-v" or arg == "--verbose": @@ -39,6 +41,8 @@ if __name__ == "__main__": offline = 1 elif arg == "-n" or arg == "--only-if-new": only_if_new = 1 + elif arg == "-x" or arg == "--expunge": + expunge = 1 elif arg.startswith("-"): print >>sys.stderr, "Unknown option:", arg sys.exit(1) @@ -62,3 +66,7 @@ if __name__ == "__main__": from planet import splice doc = splice.splice() splice.apply(doc.toxml('utf-8')) + + if expunge: + from planet import expunge # rebinds the flag name to the module + expunge.expungeCache() diff --git a/planet/config.py b/planet/config.py index 669dd68..afae785 100644 --- a/planet/config.py +++ b/planet/config.py @@ -107,6 +107,7 @@ def __init__(): define_planet('spider_threads', 0) define_planet_int('feed_timeout', 20) + define_planet_int('cache_keep_entries', 10) define_planet_list('template_files') define_planet_list('bill_of_materials') diff --git a/planet/expunge.py b/planet/expunge.py new file mode 100644 index 0000000..9f890b9 --- /dev/null +++
b/planet/expunge.py @@ -0,0 +1,68 @@ +""" Expunge old entries from a cache of entries """ +import glob, os, planet, config, feedparser +from xml.dom import minidom +from spider import filename + +def expungeCache(): + """ Trim the entry cache: keep the newest cache_keep_entries entries per subscribed feed, remove the rest and any entry whose source feed is not subscribed; entries without a source id are left alone """ + import planet + log = planet.getLogger(config.log_level(),config.log_format()) + + log.info("Determining feed subscriptions") + entry_count = {} + sources = config.cache_sources_directory() + for sub in config.subscriptions(): + data=feedparser.parse(filename(sources,sub)) + if not data.feed.has_key('id'): continue + if config.feed_options(sub).has_key('cache_keep_entries'): + entry_count[data.feed.id] = int(config.feed_options(sub)['cache_keep_entries']) + else: + entry_count[data.feed.id] = config.cache_keep_entries() + + log.info("Listing cached entries") + cache = config.cache_directory() + dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*") + if not os.path.isdir(file)] + dir.sort() + dir.reverse() # newest first, so each feed's quota is spent on its most recent entries + + for mtime,file in dir: + + try: + entry=minidom.parse(file) + # determine source of entry + entry.normalize() + sources = entry.getElementsByTagName('source') + if not sources: + # no source determined, do not delete + log.debug("No source found for %s", file) + continue + ids = sources[0].getElementsByTagName('id') + if not ids: + # feed id not found, do not delete + log.debug("No source feed id found for %s", file) + continue + if ids[0].childNodes[0].nodeValue in entry_count: + # subscribed to feed, update entry count + entry_count[ids[0].childNodes[0].nodeValue] = entry_count[ + ids[0].childNodes[0].nodeValue] - 1 + if entry_count[ids[0].childNodes[0].nodeValue] >= 0: + # maximum not reached, do not delete + log.debug("Maximum not reached for %s from %s", + file, ids[0].childNodes[0].nodeValue) + continue + else: + # maximum reached + log.debug("Removing %s, maximum reached for %s", + file, ids[0].childNodes[0].nodeValue) + else: + # not subscribed + log.debug("Removing %s, 
not subscribed to %s", + file, ids[0].childNodes[0].nodeValue) + # remove old entry + os.unlink(file) + + except Exception: + log.error("Error parsing %s", file) + +# end of expungeCache() diff --git a/tests/data/expunge/config.ini b/tests/data/expunge/config.ini new file mode 100644 index 0000000..ff750e0 --- /dev/null +++ b/tests/data/expunge/config.ini @@ -0,0 +1,20 @@ +[Planet] +name = test planet +cache_directory = tests/work/expunge/cache +cache_keep_entries = 1 + +[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1] +name = no source + +[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2] +name = no source id + +[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3] +name = global setting + +[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4] +name = local setting +cache_keep_entries = 2 + +#[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5] +#name = unsubbed diff --git a/tests/data/expunge/test1.entry b/tests/data/expunge/test1.entry new file mode 100644 index 0000000..1ef50ac --- /dev/null +++ b/tests/data/expunge/test1.entry @@ -0,0 +1,8 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1 + + Test 1/1 + Entry with missing source + 2007-03-01T01:01:00Z + \ No newline at end of file diff --git a/tests/data/expunge/test2.entry b/tests/data/expunge/test2.entry new file mode 100644 index 0000000..b07ae59 --- /dev/null +++ b/tests/data/expunge/test2.entry @@ -0,0 +1,11 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1 + + Test 2/1 + Entry with missing source id + 2007-03-01T02:01:00Z + + Test 2/1 source + + \ No newline at end of file diff --git a/tests/data/expunge/test3a.entry b/tests/data/expunge/test3a.entry new file mode 100644 index 0000000..af85d83 --- /dev/null +++ b/tests/data/expunge/test3a.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1 + + Test 3/1 + Entry for global setting 1 + 2007-03-01T03:01:00Z + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3 + Test 3 source + + \ No newline at end of file diff 
--git a/tests/data/expunge/test3b.entry b/tests/data/expunge/test3b.entry new file mode 100644 index 0000000..195cadd --- /dev/null +++ b/tests/data/expunge/test3b.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2 + + Test 3/2 + Entry for global setting 2 + 2007-03-01T03:02:00Z + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3 + Test 3 source + + \ No newline at end of file diff --git a/tests/data/expunge/test3c.entry b/tests/data/expunge/test3c.entry new file mode 100644 index 0000000..0f33eb0 --- /dev/null +++ b/tests/data/expunge/test3c.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3 + + Test 3/3 + Entry for global setting 3 + 2007-03-01T03:03:00Z + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3 + Test 3 source + + \ No newline at end of file diff --git a/tests/data/expunge/test4a.entry b/tests/data/expunge/test4a.entry new file mode 100644 index 0000000..744a83c --- /dev/null +++ b/tests/data/expunge/test4a.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1 + + Test 4/1 + Entry for local setting 1 + 2007-03-01T04:01:00Z + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4 + Test 4 source + + \ No newline at end of file diff --git a/tests/data/expunge/test4b.entry b/tests/data/expunge/test4b.entry new file mode 100644 index 0000000..4dcb6ba --- /dev/null +++ b/tests/data/expunge/test4b.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2 + + Test 4/2 + Entry for local setting 2 + 2007-03-01T04:02:00Z + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4 + Test 4 source + + \ No newline at end of file diff --git a/tests/data/expunge/test4c.entry b/tests/data/expunge/test4c.entry new file mode 100644 index 0000000..02fb184 --- /dev/null +++ b/tests/data/expunge/test4c.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3 + + Test 4/3 + Entry for local setting 3 + 2007-03-01T04:03:00Z + + 
tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4 + Test 4 source + + \ No newline at end of file diff --git a/tests/data/expunge/test5.entry b/tests/data/expunge/test5.entry new file mode 100644 index 0000000..96d338c --- /dev/null +++ b/tests/data/expunge/test5.entry @@ -0,0 +1,12 @@ + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1 + + Test 5/1 + Entry from unsubbed feed + 2007-03-01T05:01:00Z + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5 + Test 5 source + + \ No newline at end of file diff --git a/tests/data/expunge/testfeed1.atom b/tests/data/expunge/testfeed1.atom new file mode 100644 index 0000000..455803e --- /dev/null +++ b/tests/data/expunge/testfeed1.atom @@ -0,0 +1,5 @@ + + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1 + \ No newline at end of file diff --git a/tests/data/expunge/testfeed2.atom b/tests/data/expunge/testfeed2.atom new file mode 100644 index 0000000..58ae023 --- /dev/null +++ b/tests/data/expunge/testfeed2.atom @@ -0,0 +1,5 @@ + + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2 + \ No newline at end of file diff --git a/tests/data/expunge/testfeed3.atom b/tests/data/expunge/testfeed3.atom new file mode 100644 index 0000000..a8c111e --- /dev/null +++ b/tests/data/expunge/testfeed3.atom @@ -0,0 +1,5 @@ + + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3 + \ No newline at end of file diff --git a/tests/data/expunge/testfeed4.atom b/tests/data/expunge/testfeed4.atom new file mode 100644 index 0000000..10fb50d --- /dev/null +++ b/tests/data/expunge/testfeed4.atom @@ -0,0 +1,5 @@ + + + + tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4 + \ No newline at end of file diff --git a/tests/test_expunge.py b/tests/test_expunge.py new file mode 100644 index 0000000..fc93b52 --- /dev/null +++ b/tests/test_expunge.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +import unittest, os, glob, shutil, time +from planet.spider import filename +from planet import feedparser, config +from planet.expunge import 
expungeCache +from xml.dom import minidom +import planet + +workdir = 'tests/work/expunge/cache' +sourcesdir = 'tests/work/expunge/cache/sources' +testentries = 'tests/data/expunge/test*.entry' +testfeeds = 'tests/data/expunge/test*.atom' +configfile = 'tests/data/expunge/config.ini' + +class ExpungeTest(unittest.TestCase): + def setUp(self): + # silence errors + planet.logger = None + planet.getLogger('CRITICAL',None) + + try: + os.makedirs(workdir) + os.makedirs(sourcesdir) + except OSError: # work dir could not be created (e.g. left over from an earlier run): wipe and recreate + self.tearDown() + os.makedirs(workdir) + os.makedirs(sourcesdir) + + def tearDown(self): + shutil.rmtree(workdir) + os.removedirs(os.path.split(workdir)[0]) + + def test_expunge(self): + config.load(configfile) + + # create test entries in cache with correct timestamp + for entry in glob.glob(testentries): + e=minidom.parse(entry) + e.normalize() + eid = e.getElementsByTagName('id') + eupdated = e.getElementsByTagName('updated') + if not eid or not eupdated: continue # skip fixtures lacking id/updated before dereferencing them + efile = filename(workdir, eid[0].childNodes[0].nodeValue) + emtime = time.mktime(feedparser._parse_date_w3dtf(eupdated[0].childNodes[0].nodeValue)) + shutil.copyfile(entry, efile) + os.utime(efile, (emtime, emtime)) + + # create test feeds in cache + sources = config.cache_sources_directory() + for feed in glob.glob(testfeeds): + f=minidom.parse(feed) + f.normalize() + fid = f.getElementsByTagName('id') + if not fid: continue + ffile = filename(sources, fid[0].childNodes[0].nodeValue) + shutil.copyfile(feed, ffile) + + # verify that exactly nine entries + one source dir were produced + files = glob.glob(workdir+"/*") + self.assertEqual(10, len(files)) + + # verify that exactly four feeds were produced in source dir + files = glob.glob(sources+"/*") + self.assertEqual(4, len(files)) + + # expunge... 
+ expungeCache() + + # verify that five entries and one source dir are left + files = glob.glob(workdir+"/*") + self.assertEqual(6, len(files)) + + # verify that the right five entries are left + self.assertTrue(os.path.join(workdir, + 'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files) + self.assertTrue(os.path.join(workdir, + 'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files) + self.assertTrue(os.path.join(workdir, + 'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files) + self.assertTrue(os.path.join(workdir, + 'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files) + self.assertTrue(os.path.join(workdir, + 'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)