Added expunge and preliminary test cases

This commit is contained in:
Morten Frederiksen 2007-03-04 12:00:28 +01:00
parent 567eb644b8
commit a51d09ec07
21 changed files with 387 additions and 1 deletions

2
THANKS
View File

@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/ David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/
Morten Fredericksen - Support WordPress LinkManager OPML Morten Frederiksen - Support WordPress LinkManager OPML
Harry Fuecks - default item date to feed date Harry Fuecks - default item date to feed date
Antonio Cavedoni - Django templates Antonio Cavedoni - Django templates

View File

@ -111,6 +111,10 @@ no threads are used and spidering follows the traditional algorithm.</dd>
directory to be used for an additional HTTP cache to front end the Venus directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd> <code>cache_directory</code>.</dd>
<dt><ins>cache_keep_entries</ins></dt>
<dd>Used by <code>expunge</code> to determine how many entries should be
kept for each source when expunging old entries from the cache directory.
This may be overridden on a per-subscription-feed basis.</dd>
</dl> </dl>
<p>Additional options can be found in <p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p> <a href="normalization.html#overrides">normalization level overrides</a>.</p>

17
expunge.py Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python
"""
Main program to run just the expunge portion of planet
"""
import os.path
import sys
from planet import expunge, config
if __name__ == '__main__':
if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
config.load(sys.argv[1])
expunge.expungeCache()
else:
print "Usage:"
print " python %s config.ini" % sys.argv[0]

View File

@ -21,6 +21,7 @@ if __name__ == "__main__":
offline = 0 offline = 0
verbose = 0 verbose = 0
only_if_new = 0 only_if_new = 0
expunge = 0
for arg in sys.argv[1:]: for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help": if arg == "-h" or arg == "--help":
@ -31,6 +32,7 @@ if __name__ == "__main__":
print " -o, --offline Update the Planet from the cache only" print " -o, --offline Update the Planet from the cache only"
print " -h, --help Display this help message and exit" print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds" print " -n, --only-if-new Only spider new feeds"
print " -x, --expunge Expunge old entries from cache"
print print
sys.exit(0) sys.exit(0)
elif arg == "-v" or arg == "--verbose": elif arg == "-v" or arg == "--verbose":
@ -39,6 +41,8 @@ if __name__ == "__main__":
offline = 1 offline = 1
elif arg == "-n" or arg == "--only-if-new": elif arg == "-n" or arg == "--only-if-new":
only_if_new = 1 only_if_new = 1
elif arg == "-x" or arg == "--expunge":
expunge = 1
elif arg.startswith("-"): elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg print >>sys.stderr, "Unknown option:", arg
sys.exit(1) sys.exit(1)
@ -62,3 +66,7 @@ if __name__ == "__main__":
from planet import splice from planet import splice
doc = splice.splice() doc = splice.splice()
splice.apply(doc.toxml('utf-8')) splice.apply(doc.toxml('utf-8'))
if expunge:
    # -x/--expunge was given: remove old entries from the cache
    # after the output has been spliced.
    from planet import expunge
    # Bug fix: the original read `expunge.expungeCache` -- a bare attribute
    # reference that never invoked the function, so --expunge silently
    # did nothing.  The call parentheses were missing.
    expunge.expungeCache()

View File

@ -107,6 +107,7 @@ def __init__():
define_planet('spider_threads', 0) define_planet('spider_threads', 0)
define_planet_int('feed_timeout', 20) define_planet_int('feed_timeout', 20)
define_planet_int('cache_keep_entries', 10)
define_planet_list('template_files') define_planet_list('template_files')
define_planet_list('bill_of_materials') define_planet_list('bill_of_materials')

68
planet/expunge.py Normal file
View File

@ -0,0 +1,68 @@
""" Expunge old entries from a cache of entries """
import glob, os, planet, config, feedparser
from xml.dom import minidom
from spider import filename
def expungeCache():
    """ Expunge old entries from a cache of entries

    For every subscribed feed, keep the N most recent cached entries
    (N is the per-feed `cache_keep_entries` option, falling back to the
    global setting) and flag the rest for removal.  Entries whose source
    feed cannot be determined are never removed.
    """
    log = planet.getLogger(config.log_level(),config.log_format())

    log.info("Determining feed subscriptions")
    # entry_count maps a subscribed feed id to the number of cache
    # entries that feed is still allowed to keep.
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data=feedparser.parse(filename(sources,sub))
        # feeds without an id cannot be matched against entry sources
        if not data.feed.has_key('id'): continue
        if config.feed_options(sub).has_key('cache_keep_entries'):
            # Per-subscription override.  Bug fix: ConfigParser option
            # values are strings; without int() the subtraction below
            # raised TypeError (previously swallowed by the bare except).
            entry_count[data.feed.id] = int(
                config.feed_options(sub)['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    # Sort newest-first so the per-feed quota keeps the most recent entries.
    dir=[(os.stat(file).st_mtime,file) for file in glob.glob(cache+"/*")
        if not os.path.isdir(file)]
    dir.sort()
    dir.reverse()

    for mtime,file in dir:
        try:
            entry=minidom.parse(file)
            # determine source of entry
            entry.normalize()
            source_elements = entry.getElementsByTagName('source')
            if not source_elements:
                # no source determined, do not delete
                log.debug("No source found for %s", file)
                continue
            ids = source_elements[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", file)
                continue
            feed_id = ids[0].childNodes[0].nodeValue
            if feed_id in entry_count:
                # subscribed to feed, update entry count
                entry_count[feed_id] = entry_count[feed_id] - 1
                if entry_count[feed_id] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s",
                        file, feed_id)
                    continue
                else:
                    # maximum reached
                    log.debug("Removing %s, maximum reached for %s",
                        file, feed_id)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s",
                    file, feed_id)
            # remove old entry
            # NOTE: actual deletion stays disabled until the test suite
            # covers it -- presumably intentional for this preliminary code.
            #os.unlink(file)
        except Exception:
            # Narrowed from a bare except:, which also swallowed
            # KeyboardInterrupt/SystemExit.
            log.error("Error parsing %s", file)
    # end of expungeCache()

View File

@ -0,0 +1,20 @@
[Planet]
name = test planet
cache_directory = tests/work/expunge/cache
cache_keep_entries = 1
[tests/data/expunge/testfeed1.atom]
name = no source
[tests/data/expunge/testfeed2.atom]
name = no source id
[tests/data/expunge/testfeed3.atom]
name = global setting
[tests/data/expunge/testfeed4.atom]
name = local setting
cache_keep_entries = 2
#[tests/data/expunge/testfeed5.atom]
#name = unsubbed

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1</id>
<link href="http://example.com/1/1"/>
<title>Test 1/1</title>
<content>Entry with missing source</content>
<updated>2007-03-01T01:01:00Z</updated>
</entry>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1</id>
<link href="http://example.com/2/1"/>
<title>Test 2/1</title>
<content>Entry with missing source id</content>
<updated>2007-03-01T02:01:00Z</updated>
<source>
<title>Test 2/1 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1</id>
<link href="http://example.com/3/1"/>
<title>Test 3/1</title>
<content>Entry for global setting 1</content>
<updated>2007-03-01T03:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2</id>
<link href="http://example.com/3/2"/>
<title>Test 3/2</title>
<content>Entry for global setting 2</content>
<updated>2007-03-01T03:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3</id>
<link href="http://example.com/3/3"/>
<title>Test 3/3</title>
<content>Entry for global setting 3</content>
<updated>2007-03-01T03:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1</id>
<link href="http://example.com/4/1"/>
<title>Test 4/1</title>
<content>Entry for local setting 1</content>
<updated>2007-03-01T04:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2</id>
<link href="http://example.com/4/2"/>
<title>Test 4/2</title>
<content>Entry for local setting 2</content>
<updated>2007-03-01T04:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3</id>
<link href="http://example.com/4/3"/>
<title>Test 4/3</title>
<content>Entry for local setting 3</content>
<updated>2007-03-01T04:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1</id>
<link href="http://example.com/5/1"/>
<title>Test 5/1</title>
<content>Entry from unsubbed feed</content>
<updated>2007-03-01T05:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5</id>
<title>Test 5 source</title>
</source>
</entry>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed1.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed2.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed3.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed4.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
</feed>

145
tests/test_expunge.py Normal file
View File

@ -0,0 +1,145 @@
#!/usr/bin/env python
#import unittest, os, glob, calendar, shutil, time
#from planet.spider import filename, spiderPlanet, writeCache
#from planet import feedparser, config
#import planet
# Fixture locations shared by the test methods below.
workdir = 'tests/work/expunge/cache'
testfeed = 'tests/data/expunge/testfeed%s.atom'
configfile = 'tests/data/expunge/config.ini'
class ExpungeTest(unittest.TestCase):
    def setUp(self):
        # NOTE(review): this module's imports are commented out at the top
        # of the file, so names like planet/os resolve only once the
        # preliminary tests are finished -- confirm before enabling.
        # silence errors
        planet.logger = None
        planet.getLogger('CRITICAL',None)

        try:
            os.makedirs(workdir)
        except:
            # workdir left over from a previous run: wipe it and recreate
            self.tearDown()
            os.makedirs(workdir)
    def tearDown(self):
        # Remove the cache working tree created by setUp, then the
        # now-empty parent directory.
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])
    def test_filename(self):
        # NOTE(review): appears copied verbatim from the spider tests; it
        # exercises planet.spider.filename, not expunge -- confirm intent.
        self.assertEqual(os.path.join('.', 'example.com,index.html'),
            filename('.', 'http://example.com/index.html'))
        self.assertEqual(os.path.join('.',
            'planet.intertwingly.net,2006,testfeed1,1'),
            filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1'))
        self.assertEqual(os.path.join('.',
            '00000000-0000-0000-0000-000000000000'),
            filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000'))

        # Requires Python 2.3
        try:
            import encodings.idna
        except:
            # IDNA codec unavailable: skip the internationalized-domain case
            return
        self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
            filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
    def spiderFeed(self, feed_uri):
        # Helper: parse feed_uri and write its entries into the cache,
        # using an empty feed document as the feed-level metadata.
        feed_info = feedparser.parse('<feed/>')
        data = feedparser.parse(feed_uri)
        writeCache(feed_uri, feed_info, data)
    def verify_spiderFeed(self):
        # NOTE(review): expectations below (file names, planet_name 'one',
        # the 2006 dates) match the spider test fixtures, not the expunge
        # fixtures in tests/data/expunge -- confirm before enabling.
        files = glob.glob(workdir+"/*")
        files.sort()

        # verify that exactly four files + one sources dir were produced
        self.assertEqual(5, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed1,1') in files)

        # verify that the file timestamps match atom:updated
        data = feedparser.parse(files[2])
        self.assertEqual(['application/atom+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('one', data.entries[0].source.planet_name)
        self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
        self.assertEqual(os.stat(files[2]).st_mtime,
            calendar.timegm(data.entries[0].updated_parsed))
    def test_spiderFeed(self):
        # Spider a single feed ('1b') and check the resulting cache state.
        config.load(configfile)
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()
    def test_spiderUpdate(self):
        # Spider '1a' then its updated version '1b'; the final cache state
        # must match a single spider of '1b'.
        config.load(configfile)
        self.spiderFeed(testfeed % '1a')
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()
    def verify_spiderPlanet(self):
        files = glob.glob(workdir+"/*")

        # verify that exactly eight files + 1 source dir were produced
        # NOTE(review): the assertion checks for 14 files, which does not
        # match the comment above -- copied from spider tests; confirm.
        self.assertEqual(14, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed2,1') in files)

        data = feedparser.parse(workdir +
            '/planet.intertwingly.net,2006,testfeed3,1')
        self.assertEqual(['application/rss+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('three', data.entries[0].source.author_detail.name)
        self.assertEqual('three', data.entries[0].source['planet_css-id'])
    def test_spiderPlanet(self):
        # Spider every subscription from the config and verify the cache.
        config.load(configfile)
        spiderPlanet()
        self.verify_spiderPlanet()
    def test_spiderThreads(self):
        # NOTE(review): copied from the spider test suite; relies on a
        # 'threaded' variant of the config file and a test_port setting
        # that the expunge fixtures do not define -- confirm before running.
        config.load(configfile.replace('config','threaded'))
        _PORT = config.parser.getint('Planet','test_port')

        log = []
        from SimpleHTTPServer import SimpleHTTPRequestHandler
        class TestRequestHandler(SimpleHTTPRequestHandler):
            # record each request tuple instead of writing to stderr
            def log_message(self, format, *args):
                log.append(args)

        from threading import Thread
        class TestServerThread(Thread):
            def __init__(self):
                self.ready = 0
                self.done = 0
                Thread.__init__(self)
            def run(self):
                from BaseHTTPServer import HTTPServer
                httpd = HTTPServer(('',_PORT), TestRequestHandler)
                self.ready = 1
                while not self.done:
                    httpd.handle_request()

        httpd = TestServerThread()
        httpd.start()
        while not httpd.ready:
            time.sleep(0.1)

        try:
            spiderPlanet()
        finally:
            httpd.done = 1
            # one final request unblocks handle_request() so the server
            # thread can observe done and exit
            import urllib
            urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()

        # collect the HTTP status codes of all GET requests served
        status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
        status.sort()
        self.assertEqual([200,200,200,200,404], status)

        self.verify_spiderPlanet()