Added expunge and preliminary test cases

This commit is contained in:
Morten Frederiksen 2007-03-04 12:00:28 +01:00
parent 567eb644b8
commit a51d09ec07
21 changed files with 387 additions and 1 deletions

2
THANKS
View File

@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug
Eric van der Vlist - Filters to add language, category information
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/
Morten Fredericksen - Support WordPress LinkManager OPML
Morten Frederiksen - Support WordPress LinkManager OPML
Harry Fuecks - default item date to feed date
Antonio Cavedoni - Django templates

View File

@ -111,6 +111,10 @@ no threads are used and spidering follows the traditional algorithm.</dd>
directory to be used for an additional HTTP cache to front end the Venus
cache. If specified as a relative path, it is evaluated relative to the
<code>cache_directory</code>.</dd>
<dt><ins>cache_keep_entries</ins></dt>
<dd>Used by <code>expunge</code> to determine how many entries should be
kept for each source when expunging old entries from the cache directory.
This may be overridden on a per-subscription feed basis.</dd>
</dl>
<p>Additional options can be found in
<a href="normalization.html#overrides">normalization level overrides</a>.</p>

17
expunge.py Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python
"""
Main program to run just the expunge portion of planet
"""
import os.path
import sys
from planet import expunge, config
if __name__ == '__main__':
if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
config.load(sys.argv[1])
expunge.expungeCache()
else:
print "Usage:"
print " python %s config.ini" % sys.argv[0]

View File

@ -21,6 +21,7 @@ if __name__ == "__main__":
offline = 0
verbose = 0
only_if_new = 0
expunge = 0
for arg in sys.argv[1:]:
if arg == "-h" or arg == "--help":
@ -31,6 +32,7 @@ if __name__ == "__main__":
print " -o, --offline Update the Planet from the cache only"
print " -h, --help Display this help message and exit"
print " -n, --only-if-new Only spider new feeds"
print " -x, --expunge Expunge old entries from cache"
print
sys.exit(0)
elif arg == "-v" or arg == "--verbose":
@ -39,6 +41,8 @@ if __name__ == "__main__":
offline = 1
elif arg == "-n" or arg == "--only-if-new":
only_if_new = 1
elif arg == "-x" or arg == "--expunge":
expunge = 1
elif arg.startswith("-"):
print >>sys.stderr, "Unknown option:", arg
sys.exit(1)
@ -62,3 +66,7 @@ if __name__ == "__main__":
from planet import splice
doc = splice.splice()
splice.apply(doc.toxml('utf-8'))
if expunge:
    from planet import expunge
    # BUG FIX: the original read `expunge.expungeCache` — a bare attribute
    # reference that never invoked the function, so `-x/--expunge` was a
    # silent no-op. Add the call parentheses.
    expunge.expungeCache()

View File

@ -107,6 +107,7 @@ def __init__():
define_planet('spider_threads', 0)
define_planet_int('feed_timeout', 20)
define_planet_int('cache_keep_entries', 10)
define_planet_list('template_files')
define_planet_list('bill_of_materials')

68
planet/expunge.py Normal file
View File

@ -0,0 +1,68 @@
""" Expunge old entries from a cache of entries """
import glob, os, planet, config, feedparser
from xml.dom import minidom
from spider import filename
def expungeCache():
    """ Expunge old entries from a cache of entries """
    import planet
    log = planet.getLogger(config.log_level(), config.log_format())

    log.info("Determining feed subscriptions")
    # Map source feed id -> number of entries still allowed to remain
    # for that feed; decremented as cached entries are scanned.
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'): continue
        if config.feed_options(sub).has_key('cache_keep_entries'):
            # BUG FIX: per-subscription option values are strings; the
            # original stored the raw string, which made the decrement
            # below raise TypeError.  Coerce to int like the global path
            # (define_planet_int) does.
            entry_count[data.feed.id] = int(
                config.feed_options(sub)['cache_keep_entries'])
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    # Sort newest-first so the entries that are kept are the most recent.
    cached = [(os.stat(name).st_mtime, name) for name in glob.glob(cache + "/*")
        if not os.path.isdir(name)]
    cached.sort()
    cached.reverse()

    for mtime, name in cached:
        try:
            entry = minidom.parse(name)
            # determine source of entry
            entry.normalize()
            srcs = entry.getElementsByTagName('source')
            if not srcs:
                # no source determined, do not delete
                log.debug("No source found for %s", name)
                continue
            ids = srcs[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", name)
                continue
            feed_id = ids[0].childNodes[0].nodeValue
            if feed_id in entry_count:
                # subscribed to feed, update entry count
                entry_count[feed_id] = entry_count[feed_id] - 1
                if entry_count[feed_id] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s",
                        name, feed_id)
                    continue
                else:
                    # maximum reached
                    log.debug("Removing %s, maximum reached for %s",
                        name, feed_id)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s",
                    name, feed_id)
            # remove old entry
            # NOTE(review): deletion is deliberately disabled in this
            # preliminary version; enable once the test cases pass.
            #os.unlink(name)
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate; a malformed entry is logged and skipped.
            log.error("Error parsing %s", name)
    # end of expungeCache()

View File

@ -0,0 +1,20 @@
[Planet]
name = test planet
cache_directory = tests/work/expunge/cache
cache_keep_entries = 1
[tests/data/expunge/testfeed1.atom]
name = no source
[tests/data/expunge/testfeed2.atom]
name = no source id
[tests/data/expunge/testfeed3.atom]
name = global setting
[tests/data/expunge/testfeed4.atom]
name = local setting
cache_keep_entries = 2
#[tests/data/expunge/testfeed5.atom]
#name = unsubbed

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1</id>
<link href="http://example.com/1/1"/>
<title>Test 1/1</title>
<content>Entry with missing source</content>
<updated>2007-03-01T01:01:00Z</updated>
</entry>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1</id>
<link href="http://example.com/2/1"/>
<title>Test 2/1</title>
<content>Entry with missing source id</content>
<updated>2007-03-01T02:01:00Z</updated>
<source>
<title>Test 2/1 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1</id>
<link href="http://example.com/3/1"/>
<title>Test 3/1</title>
<content>Entry for global setting 1</content>
<updated>2007-03-01T03:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2</id>
<link href="http://example.com/3/2"/>
<title>Test 3/2</title>
<content>Entry for global setting 2</content>
<updated>2007-03-01T03:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3</id>
<link href="http://example.com/3/3"/>
<title>Test 3/3</title>
<content>Entry for global setting 3</content>
<updated>2007-03-01T03:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
<title>Test 3 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1</id>
<link href="http://example.com/4/1"/>
<title>Test 4/1</title>
<content>Entry for local setting 1</content>
<updated>2007-03-01T04:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2</id>
<link href="http://example.com/4/2"/>
<title>Test 4/2</title>
<content>Entry for local setting 2</content>
<updated>2007-03-01T04:02:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3</id>
<link href="http://example.com/4/3"/>
<title>Test 4/3</title>
<content>Entry for local setting 3</content>
<updated>2007-03-01T04:03:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
<title>Test 4 source</title>
</source>
</entry>

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<entry xmlns="http://www.w3.org/2005/Atom">
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1</id>
<link href="http://example.com/5/1"/>
<title>Test 5/1</title>
<content>Entry from unsubbed feed</content>
<updated>2007-03-01T05:01:00Z</updated>
<source>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5</id>
<title>Test 5 source</title>
</source>
</entry>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed1.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed2.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed3.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
</feed>

View File

@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed4.atom"/>
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
</feed>

145
tests/test_expunge.py Normal file
View File

@ -0,0 +1,145 @@
#!/usr/bin/env python
# BUG FIX: these imports were committed commented out, so importing this
# module raised NameError at class-definition time (unittest.TestCase,
# planet, os, shutil, glob, feedparser and config are all used below).
import unittest, os, glob, calendar, shutil, time
from planet.spider import filename, spiderPlanet, writeCache
from planet import feedparser, config
import planet

# Paths used by the test fixtures below.
workdir = 'tests/work/expunge/cache'
testfeed = 'tests/data/expunge/testfeed%s.atom'
configfile = 'tests/data/expunge/config.ini'
class ExpungeTest(unittest.TestCase):
def setUp(self):
# silence errors
planet.logger = None
planet.getLogger('CRITICAL',None)
try:
os.makedirs(workdir)
except:
self.tearDown()
os.makedirs(workdir)
def tearDown(self):
shutil.rmtree(workdir)
os.removedirs(os.path.split(workdir)[0])
def test_filename(self):
self.assertEqual(os.path.join('.', 'example.com,index.html'),
filename('.', 'http://example.com/index.html'))
self.assertEqual(os.path.join('.',
'planet.intertwingly.net,2006,testfeed1,1'),
filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1'))
self.assertEqual(os.path.join('.',
'00000000-0000-0000-0000-000000000000'),
filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000'))
# Requires Python 2.3
try:
import encodings.idna
except:
return
self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))
def spiderFeed(self, feed_uri):
feed_info = feedparser.parse('<feed/>')
data = feedparser.parse(feed_uri)
writeCache(feed_uri, feed_info, data)
def verify_spiderFeed(self):
files = glob.glob(workdir+"/*")
files.sort()
# verify that exactly four files + one sources dir were produced
self.assertEqual(5, len(files))
# verify that the file names are as expected
self.assertTrue(os.path.join(workdir,
'planet.intertwingly.net,2006,testfeed1,1') in files)
# verify that the file timestamps match atom:updated
data = feedparser.parse(files[2])
self.assertEqual(['application/atom+xml'], [link.type
for link in data.entries[0].source.links if link.rel=='self'])
self.assertEqual('one', data.entries[0].source.planet_name)
self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
self.assertEqual(os.stat(files[2]).st_mtime,
calendar.timegm(data.entries[0].updated_parsed))
def test_spiderFeed(self):
config.load(configfile)
self.spiderFeed(testfeed % '1b')
self.verify_spiderFeed()
def test_spiderUpdate(self):
config.load(configfile)
self.spiderFeed(testfeed % '1a')
self.spiderFeed(testfeed % '1b')
self.verify_spiderFeed()
def verify_spiderPlanet(self):
files = glob.glob(workdir+"/*")
# verify that exactly eight files + 1 source dir were produced
self.assertEqual(14, len(files))
# verify that the file names are as expected
self.assertTrue(os.path.join(workdir,
'planet.intertwingly.net,2006,testfeed1,1') in files)
self.assertTrue(os.path.join(workdir,
'planet.intertwingly.net,2006,testfeed2,1') in files)
data = feedparser.parse(workdir +
'/planet.intertwingly.net,2006,testfeed3,1')
self.assertEqual(['application/rss+xml'], [link.type
for link in data.entries[0].source.links if link.rel=='self'])
self.assertEqual('three', data.entries[0].source.author_detail.name)
self.assertEqual('three', data.entries[0].source['planet_css-id'])
def test_spiderPlanet(self):
config.load(configfile)
spiderPlanet()
self.verify_spiderPlanet()
def test_spiderThreads(self):
config.load(configfile.replace('config','threaded'))
_PORT = config.parser.getint('Planet','test_port')
log = []
from SimpleHTTPServer import SimpleHTTPRequestHandler
class TestRequestHandler(SimpleHTTPRequestHandler):
def log_message(self, format, *args):
log.append(args)
from threading import Thread
class TestServerThread(Thread):
def __init__(self):
self.ready = 0
self.done = 0
Thread.__init__(self)
def run(self):
from BaseHTTPServer import HTTPServer
httpd = HTTPServer(('',_PORT), TestRequestHandler)
self.ready = 1
while not self.done:
httpd.handle_request()
httpd = TestServerThread()
httpd.start()
while not httpd.ready:
time.sleep(0.1)
try:
spiderPlanet()
finally:
httpd.done = 1
import urllib
urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()
status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
status.sort()
self.assertEqual([200,200,200,200,404], status)
self.verify_spiderPlanet()