Added expunge and preliminary test cases
This commit is contained in:
parent
567eb644b8
commit
a51d09ec07
2
THANKS
2
THANKS
@ -9,7 +9,7 @@ Harry Fuecks - Pipe characters in file names, filter bug
|
||||
Eric van der Vlist - Filters to add language, category information
|
||||
Chris Dolan - mkdir cache; default template_dirs; fix xsltproc
|
||||
David Sifry - rss 2.0 xslt template based on http://atom.geekhood.net/
|
||||
Morten Fredericksen - Support WordPress LinkManager OPML
|
||||
Morten Frederiksen - Support WordPress LinkManager OPML
|
||||
Harry Fuecks - default item date to feed date
|
||||
Antonio Cavedoni - Django templates
|
||||
|
||||
|
@ -111,6 +111,10 @@ no threads are used and spidering follows the traditional algorithm.</dd>
|
||||
directory to be used for an additional HTTP cache to front end the Venus
|
||||
cache. If specified as a relative path, it is evaluated relative to the
|
||||
<code>cache_directory</code>.</dd>
|
||||
<dt><ins>cache_keep_entries</ins></dt>
|
||||
<dd>Used by <code>expunge</code> to determine how many entries should be
|
||||
kept for each source when expunging old entries from the cache directory.
|
||||
This may be overridden on a per subscription feed basis.</dd>
|
||||
</dl>
|
||||
<p>Additional options can be found in
|
||||
<a href="normalization.html#overrides">normalization level overrides</a>.</p>
|
||||
|
17
expunge.py
Normal file
17
expunge.py
Normal file
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Main program to run just the expunge portion of planet
|
||||
"""
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
from planet import expunge, config
|
||||
|
||||
if __name__ == '__main__':
    # Stand-alone entry point: expects exactly one argument, the path of an
    # existing planet configuration file.
    if len(sys.argv) == 2 and os.path.isfile(sys.argv[1]):
        config.load(sys.argv[1])
        expunge.expungeCache()
    else:
        # Misuse is reported on stderr with a non-zero exit status so that
        # calling scripts can detect it, as planet.py does for unknown
        # options.
        sys.stderr.write("Usage:\n")
        sys.stderr.write(" python %s config.ini\n" % sys.argv[0])
        sys.exit(1)
|
@ -21,6 +21,7 @@ if __name__ == "__main__":
|
||||
offline = 0
|
||||
verbose = 0
|
||||
only_if_new = 0
|
||||
expunge = 0
|
||||
|
||||
for arg in sys.argv[1:]:
|
||||
if arg == "-h" or arg == "--help":
|
||||
@ -31,6 +32,7 @@ if __name__ == "__main__":
|
||||
print " -o, --offline Update the Planet from the cache only"
|
||||
print " -h, --help Display this help message and exit"
|
||||
print " -n, --only-if-new Only spider new feeds"
|
||||
print " -x, --expunge Expunge old entries from cache"
|
||||
print
|
||||
sys.exit(0)
|
||||
elif arg == "-v" or arg == "--verbose":
|
||||
@ -39,6 +41,8 @@ if __name__ == "__main__":
|
||||
offline = 1
|
||||
elif arg == "-n" or arg == "--only-if-new":
|
||||
only_if_new = 1
|
||||
elif arg == "-x" or arg == "--expunge":
|
||||
expunge = 1
|
||||
elif arg.startswith("-"):
|
||||
print >>sys.stderr, "Unknown option:", arg
|
||||
sys.exit(1)
|
||||
@ -62,3 +66,7 @@ if __name__ == "__main__":
|
||||
from planet import splice
|
||||
doc = splice.splice()
|
||||
splice.apply(doc.toxml('utf-8'))
|
||||
|
||||
if expunge:
    # The import rebinds `expunge` from the command-line flag to the module;
    # the flag has already been tested at this point, so that is harmless.
    from planet import expunge
    # The function must actually be called: a bare `expunge.expungeCache`
    # only evaluates the attribute and discards it.
    expunge.expungeCache()
|
||||
|
@ -107,6 +107,7 @@ def __init__():
|
||||
define_planet('spider_threads', 0)
|
||||
|
||||
define_planet_int('feed_timeout', 20)
|
||||
define_planet_int('cache_keep_entries', 10)
|
||||
|
||||
define_planet_list('template_files')
|
||||
define_planet_list('bill_of_materials')
|
||||
|
68
planet/expunge.py
Normal file
68
planet/expunge.py
Normal file
@ -0,0 +1,68 @@
|
||||
""" Expunge old entries from a cache of entries """
|
||||
import glob, os, planet, config, feedparser
|
||||
from xml.dom import minidom
|
||||
from spider import filename
|
||||
|
||||
def expungeCache():
    """ Expunge old entries from a cache of entries

    For every subscription whose cached source feed has an id, the number of
    entries to keep is taken from the subscription's own
    ``cache_keep_entries`` option, falling back to the planet-wide setting.
    Cached entry files are then visited newest first (by modification time):
    entries without a source or without a source feed id are kept, entries
    within their feed's limit are kept, and the remaining ones (limit
    exceeded, or feed not subscribed) are flagged for removal.

    Takes no arguments and returns nothing; all results are reported through
    the planet logger.
    """
    import planet
    log = planet.getLogger(config.log_level(),config.log_format())

    log.info("Determining feed subscriptions")
    entry_count = {}
    sources = config.cache_sources_directory()
    for sub in config.subscriptions():
        data = feedparser.parse(filename(sources, sub))
        if not data.feed.has_key('id'): continue
        options = config.feed_options(sub)
        if options.has_key('cache_keep_entries'):
            # Per-feed options presumably arrive as raw strings from the
            # configuration file (verify against config.feed_options); the
            # countdown below needs an integer, and int() is harmless if the
            # value already is one.
            try:
                keep = int(options['cache_keep_entries'])
            except (TypeError, ValueError):
                log.error("Invalid cache_keep_entries for %s", sub)
                keep = config.cache_keep_entries()
            entry_count[data.feed.id] = keep
        else:
            entry_count[data.feed.id] = config.cache_keep_entries()

    log.info("Listing cached entries")
    cache = config.cache_directory()
    cached = [(os.stat(path).st_mtime, path)
        for path in glob.glob(cache+"/*") if not os.path.isdir(path)]
    # newest first, so the most recent entries of each feed are the ones kept
    cached.sort()
    cached.reverse()

    for mtime, path in cached:

        try:
            entry = minidom.parse(path)
            # determine source of entry
            entry.normalize()
            source_nodes = entry.getElementsByTagName('source')
            if not source_nodes:
                # no source determined, do not delete
                log.debug("No source found for %s", path)
                continue
            ids = source_nodes[0].getElementsByTagName('id')
            if not ids:
                # feed id not found, do not delete
                log.debug("No source feed id found for %s", path)
                continue
            feed_id = ids[0].childNodes[0].nodeValue
            if feed_id in entry_count:
                # subscribed to feed, update entry count
                entry_count[feed_id] = entry_count[feed_id] - 1
                if entry_count[feed_id] >= 0:
                    # maximum not reached, do not delete
                    log.debug("Maximum not reached for %s from %s",
                        path, feed_id)
                    continue
                # maximum reached
                log.debug("Removing %s, maximum reached for %s",
                    path, feed_id)
            else:
                # not subscribed
                log.debug("Removing %s, not subscribed to %s",
                    path, feed_id)
            # remove old entry
            # NOTE(review): deletion is still disabled in this preliminary
            # version, so the "Removing" messages above describe a dry run.
            #os.unlink(path)

        except Exception:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt and SystemExit still propagate.
            log.error("Error parsing %s", path)

# end of expungeCache()
|
20
tests/data/expunge/config.ini
Normal file
20
tests/data/expunge/config.ini
Normal file
@ -0,0 +1,20 @@
|
||||
[Planet]
|
||||
name = test planet
|
||||
cache_directory = tests/work/expunge/cache
|
||||
cache_keep_entries = 1
|
||||
|
||||
[tests/data/expunge/testfeed1.atom]
|
||||
name = no source
|
||||
|
||||
[tests/data/expunge/testfeed2.atom]
|
||||
name = no source id
|
||||
|
||||
[tests/data/expunge/testfeed3.atom]
|
||||
name = global setting
|
||||
|
||||
[tests/data/expunge/testfeed4.atom]
|
||||
name = local setting
|
||||
cache_keep_entries = 2
|
||||
|
||||
#[tests/data/expunge/testfeed5.atom]
|
||||
#name = unsubbed
|
8
tests/data/expunge/test1.entry
Normal file
8
tests/data/expunge/test1.entry
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test1/1</id>
|
||||
<link href="http://example.com/1/1"/>
|
||||
<title>Test 1/1</title>
|
||||
<content>Entry with missing source</content>
|
||||
<updated>2007-03-01T01:01:00Z</updated>
|
||||
</entry>
|
11
tests/data/expunge/test2.entry
Normal file
11
tests/data/expunge/test2.entry
Normal file
@ -0,0 +1,11 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test2/1</id>
|
||||
<link href="http://example.com/2/1"/>
|
||||
<title>Test 2/1</title>
|
||||
<content>Entry with missing source id</content>
|
||||
<updated>2007-03-01T02:01:00Z</updated>
|
||||
<source>
|
||||
<title>Test 2/1 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test3a.entry
Normal file
12
tests/data/expunge/test3a.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/1</id>
|
||||
<link href="http://example.com/3/1"/>
|
||||
<title>Test 3/1</title>
|
||||
<content>Entry for global setting 1</content>
|
||||
<updated>2007-03-01T03:01:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
|
||||
<title>Test 3 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test3b.entry
Normal file
12
tests/data/expunge/test3b.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/2</id>
|
||||
<link href="http://example.com/3/2"/>
|
||||
<title>Test 3/2</title>
|
||||
<content>Entry for global setting 2</content>
|
||||
<updated>2007-03-01T03:02:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
|
||||
<title>Test 3 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test3c.entry
Normal file
12
tests/data/expunge/test3c.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test3/3</id>
|
||||
<link href="http://example.com/3/3"/>
|
||||
<title>Test 3/3</title>
|
||||
<content>Entry for global setting 3</content>
|
||||
<updated>2007-03-01T03:03:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
|
||||
<title>Test 3 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test4a.entry
Normal file
12
tests/data/expunge/test4a.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/1</id>
|
||||
<link href="http://example.com/4/1"/>
|
||||
<title>Test 4/1</title>
|
||||
<content>Entry for local setting 1</content>
|
||||
<updated>2007-03-01T04:01:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
|
||||
<title>Test 4 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test4b.entry
Normal file
12
tests/data/expunge/test4b.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/2</id>
|
||||
<link href="http://example.com/4/2"/>
|
||||
<title>Test 4/2</title>
|
||||
<content>Entry for local setting 2</content>
|
||||
<updated>2007-03-01T04:02:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
|
||||
<title>Test 4 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test4c.entry
Normal file
12
tests/data/expunge/test4c.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test4/3</id>
|
||||
<link href="http://example.com/4/3"/>
|
||||
<title>Test 4/3</title>
|
||||
<content>Entry for local setting 3</content>
|
||||
<updated>2007-03-01T04:03:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
|
||||
<title>Test 4 source</title>
|
||||
</source>
|
||||
</entry>
|
12
tests/data/expunge/test5.entry
Normal file
12
tests/data/expunge/test5.entry
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<entry xmlns="http://www.w3.org/2005/Atom">
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-test5/1</id>
|
||||
<link href="http://example.com/5/1"/>
|
||||
<title>Test 5/1</title>
|
||||
<content>Entry from unsubbed feed</content>
|
||||
<updated>2007-03-01T05:01:00Z</updated>
|
||||
<source>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed5</id>
|
||||
<title>Test 5 source</title>
|
||||
</source>
|
||||
</entry>
|
5
tests/data/expunge/testfeed1.atom
Normal file
5
tests/data/expunge/testfeed1.atom
Normal file
@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed1.atom"/>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed1</id>
|
||||
</feed>
|
5
tests/data/expunge/testfeed2.atom
Normal file
5
tests/data/expunge/testfeed2.atom
Normal file
@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed2.atom"/>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed2</id>
|
||||
</feed>
|
5
tests/data/expunge/testfeed3.atom
Normal file
5
tests/data/expunge/testfeed3.atom
Normal file
@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed3.atom"/>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed3</id>
|
||||
</feed>
|
5
tests/data/expunge/testfeed4.atom
Normal file
5
tests/data/expunge/testfeed4.atom
Normal file
@ -0,0 +1,5 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<link rel="self" href="http://bzr.mfd-consult.dk/venus/tests/data/expunge/testfeed4.atom"/>
|
||||
<id>tag:bzr.mfd-consult.dk,2007:venus-expunge-testfeed4</id>
|
||||
</feed>
|
145
tests/test_expunge.py
Normal file
145
tests/test_expunge.py
Normal file
@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
#import unittest, os, glob, calendar, shutil, time
|
||||
#from planet.spider import filename, spiderPlanet, writeCache
|
||||
#from planet import feedparser, config
|
||||
#import planet
|
||||
|
||||
# Scratch cache directory; created in setUp and removed again in tearDown.
workdir = 'tests/work/expunge/cache'
# Path template for the test feeds; '%s' is replaced by the feed suffix.
testfeed = 'tests/data/expunge/testfeed%s.atom'
# Planet configuration file loaded by the tests.
configfile = 'tests/data/expunge/config.ini'
|
||||
|
||||
class ExpungeTest(unittest.TestCase):
    # NOTE(review): every import of this module is currently commented out,
    # so none of the names used below (unittest, os, glob, calendar, shutil,
    # time, planet, feedparser, config, filename, spiderPlanet, writeCache)
    # are bound.  The test bodies also refer to fixtures (testfeed '1a'/'1b',
    # a 'threaded' configuration, planet.intertwingly.net entry names) that
    # look like they were carried over from the spider tests -- confirm
    # before enabling.

    def setUp(self):
        # silence errors
        planet.logger = None
        planet.getLogger('CRITICAL',None)

        # Create the work directory; if that fails (presumably because a
        # previous run left it behind), clean up and try once more.
        try:
            os.makedirs(workdir)
        except:
            self.tearDown()
            os.makedirs(workdir)

    def tearDown(self):
        # Remove the cache directory, then prune its parent directory and any
        # further ancestors that have become empty.
        shutil.rmtree(workdir)
        os.removedirs(os.path.split(workdir)[0])

    def test_filename(self):
        # URIs and tag/urn identifiers are mapped to file names below the
        # given directory.
        self.assertEqual(os.path.join('.', 'example.com,index.html'),
            filename('.', 'http://example.com/index.html'))
        self.assertEqual(os.path.join('.',
            'planet.intertwingly.net,2006,testfeed1,1'),
            filename('.', u'tag:planet.intertwingly.net,2006:testfeed1,1'))
        self.assertEqual(os.path.join('.',
            '00000000-0000-0000-0000-000000000000'),
            filename('.', u'urn:uuid:00000000-0000-0000-0000-000000000000'))

        # Requires Python 2.3
        # (the internationalised host name check is skipped when the idna
        # codec cannot be imported)
        try:
            import encodings.idna
        except:
            return
        self.assertEqual(os.path.join('.', 'xn--8ws00zhy3a.com'),
            filename('.', u'http://www.\u8a79\u59c6\u65af.com/'))

    def spiderFeed(self, feed_uri):
        # Helper: parse the feed at feed_uri and write it to the cache,
        # passing the parse result of an empty feed as the feed info.
        feed_info = feedparser.parse('<feed/>')
        data = feedparser.parse(feed_uri)
        writeCache(feed_uri, feed_info, data)

    def verify_spiderFeed(self):
        # Helper: check the contents of the work directory after a single
        # feed has been spidered.
        files = glob.glob(workdir+"/*")
        files.sort()

        # verify that exactly four files + one sources dir were produced
        self.assertEqual(5, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed1,1') in files)

        # verify that the file timestamps match atom:updated
        data = feedparser.parse(files[2])
        self.assertEqual(['application/atom+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('one', data.entries[0].source.planet_name)
        self.assertEqual('2006-01-03T00:00:00Z', data.entries[0].updated)
        self.assertEqual(os.stat(files[2]).st_mtime,
            calendar.timegm(data.entries[0].updated_parsed))

    def test_spiderFeed(self):
        config.load(configfile)
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def test_spiderUpdate(self):
        # Spider version '1a' of the feed first, then '1b'; the result is
        # verified with the same checks as for '1b' alone.
        config.load(configfile)
        self.spiderFeed(testfeed % '1a')
        self.spiderFeed(testfeed % '1b')
        self.verify_spiderFeed()

    def verify_spiderPlanet(self):
        # Helper: check the contents of the work directory after the whole
        # planet has been spidered.
        files = glob.glob(workdir+"/*")

        # verify that exactly eight files + 1 source dir were produced
        # NOTE(review): the assertion below expects 14 entries, which does
        # not match the count stated in the comment above -- confirm which
        # one is intended.
        self.assertEqual(14, len(files))

        # verify that the file names are as expected
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed1,1') in files)
        self.assertTrue(os.path.join(workdir,
            'planet.intertwingly.net,2006,testfeed2,1') in files)

        data = feedparser.parse(workdir +
            '/planet.intertwingly.net,2006,testfeed3,1')
        self.assertEqual(['application/rss+xml'], [link.type
            for link in data.entries[0].source.links if link.rel=='self'])
        self.assertEqual('three', data.entries[0].source.author_detail.name)
        self.assertEqual('three', data.entries[0].source['planet_css-id'])

    def test_spiderPlanet(self):
        config.load(configfile)
        spiderPlanet()
        self.verify_spiderPlanet()

    def test_spiderThreads(self):
        # Loads the 'threaded' variant of the configuration and runs the
        # spider while a local HTTP server thread is listening on the port
        # named by the configuration's test_port option.
        config.load(configfile.replace('config','threaded'))
        _PORT = config.parser.getint('Planet','test_port')

        # request log, filled in by the handler below
        log = []
        from SimpleHTTPServer import SimpleHTTPRequestHandler
        class TestRequestHandler(SimpleHTTPRequestHandler):
            def log_message(self, format, *args):
                # record the log arguments instead of printing them
                log.append(args)

        from threading import Thread
        class TestServerThread(Thread):
            def __init__(self):
                # ready: set once the server socket exists
                # done: set by the test to end the request loop
                self.ready = 0
                self.done = 0
                Thread.__init__(self)
            def run(self):
                from BaseHTTPServer import HTTPServer
                httpd = HTTPServer(('',_PORT), TestRequestHandler)
                self.ready = 1
                while not self.done:
                    httpd.handle_request()

        httpd = TestServerThread()
        httpd.start()
        # wait until the server thread reports that it is listening
        while not httpd.ready:
            time.sleep(0.1)

        try:
            spiderPlanet()
        finally:
            # Ask the server loop to stop, then issue one more request so
            # that a pending handle_request() call returns and the loop can
            # see the flag.
            httpd.done = 1
            import urllib
            urllib.urlopen('http://127.0.0.1:%d/' % _PORT).read()

        # collect the status codes of the logged GET requests, sorted
        status = [int(rec[1]) for rec in log if str(rec[0]).startswith('GET ')]
        status.sort()
        self.assertEqual([200,200,200,200,404], status)

        self.verify_spiderPlanet()
|
Loading…
x
Reference in New Issue
Block a user