diff --git a/planet/idindex.py b/planet/idindex.py
new file mode 100644
index 0000000..2a3685f
--- /dev/null
+++ b/planet/idindex.py
@@ -0,0 +1,72 @@
+from glob import glob
+import os, sys, dbhash
+
+if __name__ == '__main__':
+    rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    sys.path.insert(0, rootdir)
+
+from planet.spider import filename
+from planet import config
+
+def open():
+    cache = config.cache_directory()
+    index=os.path.join(cache,'index')
+    if not os.path.exists(index): return None
+    return dbhash.open(filename(index, 'id'),'w')
+
+def destroy():
+    from planet import logger as log
+    cache = config.cache_directory()
+    index=os.path.join(cache,'index')
+    if not os.path.exists(index): return None
+    idindex = filename(index, 'id')
+    if os.path.exists(idindex): os.unlink(idindex)
+    os.removedirs(index)
+    log.info(idindex + " deleted")
+
+def create():
+    import libxml2
+    from planet import logger as log
+    cache = config.cache_directory()
+    index=os.path.join(cache,'index')
+    if not os.path.exists(index): os.makedirs(index)
+    index = dbhash.open(filename(index, 'id'),'c')
+
+    for file in glob(cache+"/*"):
+        if not os.path.isdir(file):
+            try:
+                doc = libxml2.parseFile(file)
+                ctxt = doc.xpathNewContext()
+                ctxt.xpathRegisterNs('atom','http://www.w3.org/2005/Atom')
+                entry = ctxt.xpathEval('/atom:entry/atom:id')
+                source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
+                if entry and source:
+                    index[filename('',entry[0].content)] = source[0].content
+                doc.freeDoc()
+            except:
+                log.error(file)
+
+    log.info(str(len(index.keys())) + " entries indexed")
+    index.close()
+
+    return open()
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print 'Usage: %s [-c|-d]' % sys.argv[0]
+        sys.exit(1)
+
+    config.load(sys.argv[1])
+
+    if len(sys.argv) > 2 and sys.argv[2] == '-c':
+        create()
+    elif len(sys.argv) > 2 and sys.argv[2] == '-d':
+        destroy()
+    else:
+        from planet import logger as log
+        index = open()
+        if index:
+            log.info(str(len(index.keys())) + " entries indexed")
+            index.close()
+        else:
+            log.info("no entries indexed")
diff --git a/planet/spider.py b/planet/spider.py
index 0cd5523..42131ae 100644
--- a/planet/spider.py
+++ b/planet/spider.py
@@ -196,6 +196,9 @@ def spiderFeed(feed):
     # perform user configured scrub operations on the data
     scrub(feed, data)
 
+    from planet import idindex
+    index = idindex.open()
+
     # write each entry to the cache
     cache = config.cache_directory()
     for entry in data.entries:
@@ -234,6 +237,13 @@ def spiderFeed(feed):
             write(output, cache_file)
             os.utime(cache_file, (mtime, mtime))
 
+            # optionally index
+            if index != None:
+                index[filename('', entry.id)] = \
+                    data.feed.get('id', data.feed.get('link',None))
+
+    if index: index.close()
+
     # identify inactive feeds
     if config.activity_threshold(feed):
         updated = [entry.updated_parsed for entry in data.entries
diff --git a/planet/splice.py b/planet/splice.py
index ddb11ed..015b4cd 100644
--- a/planet/splice.py
+++ b/planet/splice.py
@@ -4,6 +4,7 @@ from xml.dom import minidom
 import planet, config, feedparser, reconstitute, shell
 from reconstitute import createTextElement, date
 from spider import filename
+from planet import idindex
 
 def splice():
     """ Splice together a planet from a cache of entries """
@@ -62,9 +63,12 @@ def splice():
 
         reconstitute.source(xdoc.documentElement, data.feed, None, None)
         feed.appendChild(xdoc.documentElement)
 
+    index = idindex.open()
+
     # insert entry information
     items = 0
     for mtime,file in dir:
+        if index and index[file.split('/')[-1]] not in sub_ids: continue
         try:
             entry=minidom.parse(file)
@@ -83,6 +87,8 @@ def splice():
         except:
             log.error("Error parsing %s", file)
 
+    if index: index.close()
+
     return doc
 
 def apply(doc):
diff --git a/tests/test_idindex.py b/tests/test_idindex.py
new file mode 100644
index 0000000..dda08d6
--- /dev/null
+++ b/tests/test_idindex.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+import unittest
+from planet import idindex, config, logger
+
+class idIndexTest(unittest.TestCase):
+
+    def tearDown(self):
+        idindex.destroy()
+
+    def test_index_spider(self):
+        import test_spider
+        config.load(test_spider.configfile)
+
+        index = idindex.create()
+        self.assertEqual(0, len(index))
+        index.close()
+
+        from planet.spider import spiderPlanet
+        try:
+            spiderPlanet()
+
+            index = idindex.open()
+            self.assertEqual(12, len(index))
+            self.assertEqual('tag:planet.intertwingly.net,2006:testfeed1', index['planet.intertwingly.net,2006,testfeed1,1'])
+            self.assertEqual('http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss', index['planet.intertwingly.net,2006,testfeed3,1'])
+            index.close()
+        finally:
+            import os, shutil
+            shutil.rmtree(test_spider.workdir)
+            os.removedirs(os.path.split(test_spider.workdir)[0])
+
+    def test_index_splice(self):
+        import test_splice
+        config.load(test_splice.configfile)
+        index = idindex.create()
+
+        self.assertEqual(12, len(index))
+        self.assertEqual('tag:planet.intertwingly.net,2006:testfeed1', index['planet.intertwingly.net,2006,testfeed1,1'])
+        self.assertEqual('http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss', index['planet.intertwingly.net,2006,testfeed3,1'])
+
+        for key,value in index.items():
+            if value.find('testfeed2')>0: index[key] = value[::-1]
+        index.close()
+
+        from planet.splice import splice
+        doc = splice()
+
+        self.assertEqual(8,len(doc.getElementsByTagName('entry')))
+        self.assertEqual(4,len(doc.getElementsByTagName('planet:source')))
+        self.assertEqual(12,len(doc.getElementsByTagName('planet:name')))
+
+try:
+    import libxml2
+except ImportError:
+    logger.warn("libxml2 is not available => can't test id index")
+    for method in dir(idIndexTest):
+        if method.startswith('test_'): delattr(idIndexTest,method)