Added expunge and test cases

This commit is contained in:
Morten Frederiksen 2007-03-04 15:23:20 +01:00
commit 5f21b167ff
21 changed files with 325 additions and 1 deletions

2
THANKS
View File

@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/ David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/
Morten Fredericksen - Support WordPress LinkManager OPML Morten Frederiksen - Support WordPress LinkManager OPML
Harry Fuecks - default item date to feed date Harry Fuecks - default item date to feed date
Antonio Cavedoni - Django templates Antonio Cavedoni - Django templates

View File

@ -111,6 +111,10 @@ no threads are used and spidering follows the traditional algorithm.</dd>
directory to be used for an additional HTTP cache to front end the Venus directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd> <code>cache_directory</code>.</dd>
<dt><ins>cache_keep_entries</ins></dt>
<dd>Used by <code>expunge</code> to determine how many entries should be
kept for each source when expunging old entries from the cache directory.
This may be overridden on a per subscription feed basis.</dd>
</dl> </dl>
<p>Additional options can be found in <p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p> <a href="normalization.html#overrides">normalization level overrides</a>.</p>

17
expunge.py Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python
"""
Main program to run just the expunge portion of planet
"""
import os.path
import sys
from planet import expunge, config
if __name__ == '__main__':
    # Exactly one argument is accepted: the path to an existing config file.
    if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
        config.load(sys.argv[1])
        expunge.expungeCache()
    else:
        # Misuse is reported on stderr with a non-zero exit status so that
        # calling scripts (cron jobs, shell pipelines) can detect the failure.
        sys.stderr.write("Usage:\n")
        sys.stderr.write(" python %s config.ini\n" % sys.argv[0])
        sys.exit(1)

View File

@ -21,6 +21,7 @@ if __name__ == "__main__":
offline = 0 offline = 0
verbose = 0 verbose = 0
only_if_new = 0 only_if_new = 0
expunge = 0
for arg in sys.argv[1:]: for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help": if arg == "-h" or arg == "--help":
@ -31,6 +32,7 @@ if __name__ == "__main__":
print " -o, --offline Update the Planet from the cache only" print " -o, --offline Update the Planet from the cache only"
print " -h, --help Display this help message and exit" print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds" print " -n, --only-if-new Only spider new feeds"
print " -x, --expunge Expunge old entries from cache"
print print
sys.exit(0) sys.exit(0)
elif arg == "-v" or arg == "--verbose": elif arg == "-v" or arg == "--verbose":
@ -39,6 +41,8 @@ if __name__ == "__main__":
offline = 1 offline = 1
elif arg == "-n" or arg == "--only-if-new": elif arg == "-n" or arg == "--only-if-new":
only_if_new = 1 only_if_new = 1
elif arg == "-x" or arg == "--expunge":
expunge = 1
elif arg.startswith("-"): elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg print >>sys.stderr, "Unknown option:", arg
sys.exit(1) sys.exit(1)
@ -62,3 +66,7 @@ if __name__ == "__main__":
from planet import splice from planet import splice
doc = splice.splice() doc = splice.splice()
splice.apply(doc.toxml('utf-8')) splice.apply(doc.toxml('utf-8'))
if expunge:
    # Note: this import rebinds the name of the -x/--expunge flag to the
    # module; that is harmless because the flag has already been tested.
    from planet import expunge
    # The function must actually be called; a bare attribute reference
    # is a no-op and would leave the cache untouched.
    expunge.expungeCache()

View File

@ -107,6 +107,7 @@ def __init__():
define_planet('spider_threads', 0) define_planet('spider_threads', 0)
define_planet_int('feed_timeout', 20) define_planet_int('feed_timeout', 20)
define_planet_int('cache_keep_entries', 10)
define_planet_list('template_files') define_planet_list('template_files')
define_planet_list('bill_of_materials') define_planet_list('bill_of_materials')

68
planet/expunge.py Normal file
View File

@ -0,0 +1,68 @@
""" Expunge old entries from a cache of entries """
import glob, os, planet, config, feedparser
from xml.dom import minidom
from spider import filename
def expungeCache():
    """ Expunge old entries from a cache of entries

    For every subscription whose cached source document carries a feed id,
    a quota is recorded: the per-feed ``cache_keep_entries`` option when it
    is present, otherwise the planet-wide ``cache_keep_entries`` setting.

    The plain files in the cache directory are then visited newest first
    (by modification time).  An entry file is removed when its source feed
    id does not belong to any subscription, or when the quota of its feed
    has already been used up by newer entries.  Files that cannot be
    parsed, or that carry no source feed id, are left in place.
    """
    import planet
    log = planet.getLogger(config.log_level(),config.log_format())

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'): continue
        options = config.feed_options(sub)
        if options.has_key('cache_keep_entries'):
            entry_count[data.feed.id] = int(options['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    # Newest first, so each feed's quota is spent on its most recent entries.
    cached = [(os.stat(path).st_mtime, path)
        for path in glob.glob(cache+"/*") if not os.path.isdir(path)]
    cached.sort()
    cached.reverse()

    for mtime, path in cached:
        # Only the parse itself is reported as a parse error; anything that
        # is not well-formed XML is kept rather than deleted.
        try:
            entry = minidom.parse(path)
        except Exception:
            log.error("Error parsing %s", path)
            continue

        # determine source of entry
        entry.normalize()
        source_nodes = entry.getElementsByTagName('source')
        if not source_nodes:
            # no source determined, do not delete
            log.debug("No source found for %s", path)
            continue
        ids = source_nodes[0].getElementsByTagName('id')
        if not ids or not ids[0].childNodes:
            # feed id missing or empty, do not delete
            log.debug("No source feed id found for %s", path)
            continue
        feed_id = ids[0].childNodes[0].nodeValue

        if feed_id in entry_count:
            # subscribed to feed, update entry count
            entry_count[feed_id] = entry_count[feed_id] - 1
            if entry_count[feed_id] >= 0:
                # maximum not reached, do not delete
                log.debug("Maximum not reached for %s from %s",
                    path, feed_id)
                continue
            # maximum reached
            log.debug("Removing %s, maximum reached for %s", path, feed_id)
        else:
            # not subscribed
            log.debug("Removing %s, not subscribed to %s", path, feed_id)

        # remove old entry; a failure here is a removal problem, not a
        # parse problem, and must not stop the remaining files from being
        # processed
        try:
            os.unlink(path)
        except OSError:
            log.error("Error removing %s", path)

View File

@ -0,0 +1,20 @@
[Planet]
name = test planet
cache_directory = tests/work/expunge/cache
cache_keep_entries = 1
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1]
name = no source
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2]
name = no source id
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3]
name = global setting
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4]
name = local setting
cache_keep_entries = 2
#[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5]
#name = unsubbed

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1</id>
<link href="http://example.com/1/1"/>
<title>Test 1/1</title>
<content>Entry with missing source</content>
<updated>2007-03-01T01:01:00Z</updated>
</entry>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1</id>
<link href="http://example.com/2/1"/>
<title>Test 2/1</title>
<content>Entry with missing source id</content>
<updated>2007-03-01T02:01:00Z</updated>
<source>
<title>Test 2/1 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1</id>
<link href="http://example.com/3/1"/>
<title>Test 3/1</title>
<content>Entry for global setting 1</content>
<updated>2007-03-01T03:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2</id>
<link href="http://example.com/3/2"/>
<title>Test 3/2</title>
<content>Entry for global setting 2</content>
<updated>2007-03-01T03:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3</id>
<link href="http://example.com/3/3"/>
<title>Test 3/3</title>
<content>Entry for global setting 3</content>
<updated>2007-03-01T03:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1</id>
<link href="http://example.com/4/1"/>
<title>Test 4/1</title>
<content>Entry for local setting 1</content>
<updated>2007-03-01T04:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2</id>
<link href="http://example.com/4/2"/>
<title>Test 4/2</title>
<content>Entry for local setting 2</content>
<updated>2007-03-01T04:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3</id>
<link href="http://example.com/4/3"/>
<title>Test 4/3</title>
<content>Entry for local setting 3</content>
<updated>2007-03-01T04:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1</id>
<link href="http://example.com/5/1"/>
<title>Test 5/1</title>
<content>Entry from unsubbed feed</content>
<updated>2007-03-01T05:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5</id>
<title>Test 5 source</title>
</source>
</entry>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed1.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed2.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed3.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed4.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
</feed>

83
tests/test_expunge.py Normal file
View File

@ -0,0 +1,83 @@
#!/usr/bin/env python
import unittest, os, glob, shutil, time
from planet.spider import filename
from planet import feedparser, config
from planet.expunge import expungeCache
from xml.dom import minidom
import planet
workdir = 'tests/work/expunge/cache'
sourcesdir = 'tests/work/expunge/cache/sources'
testentries = 'tests/data/expunge/test*.entry'
testfeeds = 'tests/data/expunge/test*.atom'
configfile = 'tests/data/expunge/config.ini'
class ExpungeTest(unittest.TestCase):
    """ Run expungeCache against a small fixture cache and check survivors """

    def setUp(self):
        """ Create an empty work cache with its sources subdirectory """
        # silence errors
        planet.logger = None
        planet.getLogger('CRITICAL',None)

        try:
            os.makedirs(workdir)
            os.makedirs(sourcesdir)
        except OSError:
            # leftovers from an earlier run: wipe them and start afresh
            self.tearDown()
            os.makedirs(workdir)
            os.makedirs(sourcesdir)

    def tearDown(self):
        """ Remove the work cache and any parent directories left empty """
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])

    def test_expunge(self):
        """ Populate the cache, expunge, and verify which entries remain """
        config.load(configfile)

        # create test entries in cache with correct timestamp
        for entry in glob.glob(testentries):
            e = minidom.parse(entry)
            e.normalize()
            eid = e.getElementsByTagName('id')
            eupdated = e.getElementsByTagName('updated')
            # skip incomplete fixtures before either element is dereferenced
            if not eid or not eupdated: continue
            efile = filename(workdir, eid[0].childNodes[0].nodeValue)
            emtime = time.mktime(
                feedparser._parse_date_w3dtf(eupdated[0].childNodes[0].nodeValue))
            shutil.copyfile(entry, efile)
            # expungeCache orders entries by mtime, so mirror <updated>
            os.utime(efile, (emtime, emtime))

        # create test feeds in cache
        sources = config.cache_sources_directory()
        for feed in glob.glob(testfeeds):
            f = minidom.parse(feed)
            f.normalize()
            fid = f.getElementsByTagName('id')
            if not fid: continue
            ffile = filename(sources, fid[0].childNodes[0].nodeValue)
            shutil.copyfile(feed, ffile)

        # verify that exactly nine entries + one source dir were produced
        files = glob.glob(workdir+"/*")
        self.assertEqual(10, len(files))

        # verify that exactly four feeds were produced in source dir
        files = glob.glob(sources+"/*")
        self.assertEqual(4, len(files))

        # expunge...
        expungeCache()

        # verify that five entries and one source dir are left
        files = glob.glob(workdir+"/*")
        self.assertEqual(6, len(files))

        # verify that the right five entries are left
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)