This commit is contained in:
Sam Ruby 2006-10-16 14:27:21 -04:00
parent 002471fc68
commit 46ca6cd4f4
4 changed files with 146 additions and 0 deletions

72
planet/idindex.py Normal file
View File

@ -0,0 +1,72 @@
from glob import glob
import os, sys, dbhash

if __name__ == '__main__':
    # When run as a script (rather than imported), put the project root on
    # sys.path so the "planet" package imports below resolve without the
    # package being installed.
    rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, rootdir)

from planet.spider import filename
from planet import config
def open():
    """Open the existing id index for writing.

    Returns the dbhash handle, or None when the index directory has not
    been created yet (i.e. create() has never run for this cache).
    """
    index_dir = os.path.join(config.cache_directory(), 'index')
    if os.path.exists(index_dir):
        return dbhash.open(filename(index_dir, 'id'), 'w')
    return None
def destroy():
    """Delete the id index database and remove its directory.

    A no-op (returning None) when the index directory does not exist.
    """
    from planet import logger as log
    index_dir = os.path.join(config.cache_directory(), 'index')
    if not os.path.exists(index_dir):
        return None
    idindex = filename(index_dir, 'id')
    if os.path.exists(idindex):
        os.unlink(idindex)
    os.removedirs(index_dir)
    log.info(idindex + " deleted")
def create():
    """Build the id index from every entry file in the cache.

    For each Atom entry in the cache directory, maps the entry's id (in
    cache-filename form) to the id of the feed it came from.  Returns the
    freshly built index re-opened for writing (see open()).
    """
    import libxml2
    from planet import logger as log
    cache = config.cache_directory()
    index_dir = os.path.join(cache, 'index')
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    index = dbhash.open(filename(index_dir, 'id'), 'c')

    for file in glob(cache + "/*"):
        if os.path.isdir(file):
            continue
        # parse failures are logged and skipped; the index stays best-effort
        try:
            doc = libxml2.parseFile(file)
        except Exception:   # was a bare except: don't swallow KeyboardInterrupt
            log.error(file)
            continue
        try:
            try:
                ctxt = doc.xpathNewContext()
                ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
                entry = ctxt.xpathEval('/atom:entry/atom:id')
                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
                # only index complete entries that record their source feed
                if entry and source:
                    index[filename('', entry[0].content)] = source[0].content
            except Exception:
                log.error(file)
        finally:
            doc.freeDoc()   # free the parsed doc even when xpath work fails

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()

    return open()
if __name__ == '__main__':
if len(sys.argv) < 2:
print 'Usage: %s [-c|-d]' % sys.argv[0]
sys.exit(1)
config.load(sys.argv[1])
if len(sys.argv) > 2 and sys.argv[2] == '-c':
create()
elif len(sys.argv) > 2 and sys.argv[2] == '-d':
destroy()
else:
from planet import logger as log
index = open()
if index:
log.info(str(len(index.keys())) + " entries indexed")
index.close()
else:
log.info("no entries indexed")

View File

@ -196,6 +196,9 @@ def spiderFeed(feed):
# perform user configured scrub operations on the data
scrub(feed, data)
from planet import idindex
index = idindex.open()
# write each entry to the cache
cache = config.cache_directory()
for entry in data.entries:
@ -234,6 +237,13 @@ def spiderFeed(feed):
write(output, cache_file)
os.utime(cache_file, (mtime, mtime))
# optionally index
if index != None:
index[filename('', entry.id)] = \
data.feed.get('id', data.feed.get('link',None))
if index: index.close()
# identify inactive feeds
if config.activity_threshold(feed):
updated = [entry.updated_parsed for entry in data.entries

View File

@ -4,6 +4,7 @@ from xml.dom import minidom
import planet, config, feedparser, reconstitute, shell
from reconstitute import createTextElement, date
from spider import filename
from planet import idindex
def splice():
""" Splice together a planet from a cache of entries """
@ -62,9 +63,12 @@ def splice():
reconstitute.source(xdoc.documentElement, data.feed, None, None)
feed.appendChild(xdoc.documentElement)
index = idindex.open()
# insert entry information
items = 0
for mtime,file in dir:
if index and index[file.split('/')[-1]] not in sub_ids: continue
try:
entry=minidom.parse(file)
@ -83,6 +87,8 @@ def splice():
except:
log.error("Error parsing %s", file)
if index: index.close()
return doc
def apply(doc):

58
tests/test_idindex.py Normal file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import unittest
from planet import idindex, config, logger
class idIndexTest(unittest.TestCase):
    """Exercise the id index across a full spider run and a splice."""

    def tearDown(self):
        # every test leaves a populated index behind; wipe it
        idindex.destroy()

    def test_index_spider(self):
        import test_spider
        config.load(test_spider.configfile)

        # a freshly created index over an empty cache holds nothing
        idx = idindex.create()
        self.assertEqual(0, len(idx))
        idx.close()

        from planet.spider import spiderPlanet
        try:
            spiderPlanet()

            # spidering fills the index; spot-check two known entries
            idx = idindex.open()
            self.assertEqual(12, len(idx))
            self.assertEqual('tag:planet.intertwingly.net,2006:testfeed1', idx['planet.intertwingly.net,2006,testfeed1,1'])
            self.assertEqual('http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss', idx['planet.intertwingly.net,2006,testfeed3,1'])
            idx.close()
        finally:
            import os, shutil
            shutil.rmtree(test_spider.workdir)
            os.removedirs(os.path.split(test_spider.workdir)[0])

    def test_index_splice(self):
        import test_splice
        config.load(test_splice.configfile)
        idx = idindex.create()

        self.assertEqual(12, len(idx))
        self.assertEqual('tag:planet.intertwingly.net,2006:testfeed1', idx['planet.intertwingly.net,2006,testfeed1,1'])
        self.assertEqual('http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss', idx['planet.intertwingly.net,2006,testfeed3,1'])

        # corrupt the source ids for testfeed2 entries so splice drops them
        # (find(...)>0 deliberately skips matches at position 0)
        for key, value in idx.items():
            if value.find('testfeed2') > 0:
                idx[key] = value[::-1]
        idx.close()

        from planet.splice import splice
        doc = splice()

        # testfeed2's four entries are excluded from the spliced output
        self.assertEqual(8, len(doc.getElementsByTagName('entry')))
        self.assertEqual(4, len(doc.getElementsByTagName('planet:source')))
        self.assertEqual(12, len(doc.getElementsByTagName('planet:name')))
try:
    import libxml2
except ImportError:
    # The index is built with libxml2; without it these tests can only
    # fail, so strip them while leaving the module importable.
    logger.warn("libxml2 is not available => can't test id index")
    for name in [m for m in dir(idIndexTest) if m.startswith('test_')]:
        delattr(idIndexTest, name)