Added expunge and test cases

This commit is contained in:
Morten Frederiksen 2007-03-04 15:23:20 +01:00
commit 5f21b167ff
21 changed files with 325 additions and 1 deletions

2
THANKS
View File

@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/
Morten Fredericksen - Support WordPress LinkManager OPML
Morten Frederiksen - Support WordPress LinkManager OPML
Harry Fuecks - default item date to feed date
Antonio Cavedoni - Django templates

View File

@ -111,6 +111,10 @@ no threads are used and spidering follows the traditional algorithm.</dd>
directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd>
<dt><ins>cache_keep_entries</ins></dt>
<dd>Used by <code>expunge</code> to determine how many entries should be
kept for each source when expunging old entries from the cache directory.
This may be overridden on a per subscription feed basis.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>

17
expunge.py Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python
"""
Main program to run just the expunge portion of planet
"""
import os.path
import sys
from planet import expunge, config
if __name__ == '__main__':
    # Exactly one argument is accepted, and it must name an existing file:
    # the planet configuration to load before expunging.
    args = sys.argv[1:]
    if len(args) == 1 and os.path.isfile(args[0]):
        config.load(args[0])
        expunge.expungeCache()
    else:
        # Anything else (no argument, extra arguments, missing file) just
        # prints the usage text.
        print("Usage:")
        print(" python %s config.ini" % sys.argv[0])

View File

@ -21,6 +21,7 @@ if __name__ == "__main__":
offline = 0
verbose = 0
only_if_new = 0
expunge = 0
for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help":
@ -31,6 +32,7 @@ if __name__ == "__main__":
print " -o, --offline Update the Planet from the cache only"
print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds"
print " -x, --expunge Expunge old entries from cache"
print
sys.exit(0)
elif arg == "-v" or arg == "--verbose":
@ -39,6 +41,8 @@ if __name__ == "__main__":
offline = 1
elif arg == "-n" or arg == "--only-if-new":
only_if_new = 1
elif arg == "-x" or arg == "--expunge":
expunge = 1
elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg
sys.exit(1)
@ -62,3 +66,7 @@ if __name__ == "__main__":
from planet import splice
doc = splice.splice()
splice.apply(doc.toxml('utf-8'))
if expunge:
    # -x / --expunge was given.  Note that this import rebinds the name
    # `expunge` from the command line flag to the planet.expunge module.
    from planet import expunge
    # The function has to be *called*; a bare `expunge.expungeCache`
    # only evaluates the attribute and expunges nothing.
    expunge.expungeCache()

View File

@ -107,6 +107,7 @@ def __init__():
define_planet('spider_threads', 0)
define_planet_int('feed_timeout', 20)
define_planet_int('cache_keep_entries', 10)
define_planet_list('template_files')
define_planet_list('bill_of_materials')

68
planet/expunge.py Normal file
View File

@ -0,0 +1,68 @@
""" Expunge old entries from a cache of entries """
import glob, os, planet, config, feedparser
from xml.dom import minidom
from spider import filename
def expungeCache():
    """ Expunge old entries from a cache of entries

    First, an allowance is computed for every subscription whose cached
    source document (as parsed by feedparser) carries a feed id: the
    per-feed 'cache_keep_entries' option if present, otherwise the value
    of config.cache_keep_entries().

    Then every non-directory file in config.cache_directory() is visited,
    most recently modified first:

      * files that cannot be parsed as XML are logged and kept;
      * entries without a <source>, or without a usable <id> inside the
        first <source>, are kept;
      * entries of a subscribed feed are kept until that feed's allowance
        is used up, after which they are removed;
      * entries whose source feed id is not subscribed are removed.
    """
    log = planet.getLogger(config.log_level(),config.log_format())

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'): continue
        # look the options up once; a per-feed setting wins over the global
        options = config.feed_options(sub)
        if options.has_key('cache_keep_entries'):
            entry_count[data.feed.id] = int(options['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    cached = [(os.stat(path).st_mtime, path)
              for path in glob.glob(cache+"/*")
              if not os.path.isdir(path)]
    # newest first, so that the allowance is spent on the most recent entries
    cached.sort()
    cached.reverse()

    for mtime, path in cached:
        # Only the parse step is guarded here, so that the "Error parsing"
        # message really means the file could not be parsed.
        try:
            entry = minidom.parse(path)
        except Exception:
            log.error("Error parsing %s", path)
            continue

        # determine source of entry
        entry.normalize()
        entry_sources = entry.getElementsByTagName('source')
        if not entry_sources:
            # no source determined, do not delete
            log.debug("No source found for %s", path)
            continue
        ids = entry_sources[0].getElementsByTagName('id')
        if not ids or not ids[0].childNodes:
            # feed id not found (or empty), do not delete
            log.debug("No source feed id found for %s", path)
            continue
        feed_id = ids[0].childNodes[0].nodeValue

        if feed_id in entry_count:
            # subscribed to feed, update entry count
            entry_count[feed_id] = entry_count[feed_id] - 1
            if entry_count[feed_id] >= 0:
                # maximum not reached, do not delete
                log.debug("Maximum not reached for %s from %s",
                    path, feed_id)
                continue
            # maximum reached
            log.debug("Removing %s, maximum reached for %s",
                path, feed_id)
        else:
            # not subscribed
            log.debug("Removing %s, not subscribed to %s",
                path, feed_id)

        # remove old entry; a failure here is reported as such and does not
        # stop the remaining files from being processed
        try:
            os.unlink(path)
        except OSError:
            log.error("Error removing %s", path)
# end of expungeCache()

View File

@ -0,0 +1,20 @@
[Planet]
name = test planet
cache_directory = tests/work/expunge/cache
cache_keep_entries = 1
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1]
name = no source
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2]
name = no source id
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3]
name = global setting
[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4]
name = local setting
cache_keep_entries = 2
#[tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5]
#name = unsubbed

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1</id>
<link href="http://example.com/1/1"/>
<title>Test 1/1</title>
<content>Entry with missing source</content>
<updated>2007-03-01T01:01:00Z</updated>
</entry>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1</id>
<link href="http://example.com/2/1"/>
<title>Test 2/1</title>
<content>Entry with missing source id</content>
<updated>2007-03-01T02:01:00Z</updated>
<source>
<title>Test 2/1 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1</id>
<link href="http://example.com/3/1"/>
<title>Test 3/1</title>
<content>Entry for global setting 1</content>
<updated>2007-03-01T03:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2</id>
<link href="http://example.com/3/2"/>
<title>Test 3/2</title>
<content>Entry for global setting 2</content>
<updated>2007-03-01T03:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3</id>
<link href="http://example.com/3/3"/>
<title>Test 3/3</title>
<content>Entry for global setting 3</content>
<updated>2007-03-01T03:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1</id>
<link href="http://example.com/4/1"/>
<title>Test 4/1</title>
<content>Entry for local setting 1</content>
<updated>2007-03-01T04:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2</id>
<link href="http://example.com/4/2"/>
<title>Test 4/2</title>
<content>Entry for local setting 2</content>
<updated>2007-03-01T04:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3</id>
<link href="http://example.com/4/3"/>
<title>Test 4/3</title>
<content>Entry for local setting 3</content>
<updated>2007-03-01T04:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1</id>
<link href="http://example.com/5/1"/>
<title>Test 5/1</title>
<content>Entry from unsubbed feed</content>
<updated>2007-03-01T05:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5</id>
<title>Test 5 source</title>
</source>
</entry>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed1.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed2.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed3.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed4.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
</feed>

83
tests/test_expunge.py Normal file
View File

@ -0,0 +1,83 @@
#!/usr/bin/env python
import unittest, os, glob, shutil, time
from planet.spider import filename
from planet import feedparser, config
from planet.expunge import expungeCache
from xml.dom import minidom
import planet
workdir = 'tests/work/expunge/cache'
sourcesdir = 'tests/work/expunge/cache/sources'
testentries = 'tests/data/expunge/test*.entry'
testfeeds = 'tests/data/expunge/test*.atom'
configfile = 'tests/data/expunge/config.ini'
class ExpungeTest(unittest.TestCase):
    """ Run expungeCache() against a scratch cache built from test data """

    def setUp(self):
        """ Silence logging and create empty cache and sources directories """
        # silence errors
        planet.logger = None
        planet.getLogger('CRITICAL',None)

        try:
            os.makedirs(workdir)
            os.makedirs(sourcesdir)
        except OSError:
            # directories could not be created (e.g. left over from an
            # earlier run): clean up and try once more
            self.tearDown()
            os.makedirs(workdir)
            os.makedirs(sourcesdir)

    def tearDown(self):
        """ Remove the scratch cache and its now empty parent directories """
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])

    def test_expunge(self):
        """ Populate the cache, expunge, and check which entries survive """
        config.load(configfile)

        # create test entries in cache with correct timestamp
        for entry in glob.glob(testentries):
            e=minidom.parse(entry)
            e.normalize()
            eid = e.getElementsByTagName('id')
            eupdated = e.getElementsByTagName('updated')
            # skip incomplete fixtures *before* indexing into the node lists
            if not eid or not eupdated: continue
            efile = filename(workdir, eid[0].childNodes[0].nodeValue)
            # NOTE(review): mktime reads the tuple as local time; presumably
            # harmless because expungeCache only compares mtimes - confirm
            emtime = time.mktime(feedparser._parse_date_w3dtf(
                eupdated[0].childNodes[0].nodeValue))
            shutil.copyfile(entry, efile)
            os.utime(efile, (emtime, emtime))

        # create test feeds in cache
        sources = config.cache_sources_directory()
        for feed in glob.glob(testfeeds):
            f=minidom.parse(feed)
            f.normalize()
            fid = f.getElementsByTagName('id')
            if not fid: continue
            ffile = filename(sources, fid[0].childNodes[0].nodeValue)
            shutil.copyfile(feed, ffile)

        # verify that exactly nine entries + one source dir were produced
        files = glob.glob(workdir+"/*")
        self.assertEqual(10, len(files))

        # verify that exactly four feeds were produced in source dir
        files = glob.glob(sources+"/*")
        self.assertEqual(4, len(files))

        # expunge...
        expungeCache()

        # verify that five entries and one source dir are left
        files = glob.glob(workdir+"/*")
        self.assertEqual(6, len(files))

        # verify that the right five entries are left
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test2,1') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test3,3') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,2') in files)
        self.assertTrue(os.path.join(workdir,
            'bzr.mfd-consult.dk,2007,venus-expunge-test4,3') in files)