This commit is contained in:
Sam Ruby 2006-10-16 14:27:21 -04:00
parent 002471fc68
commit 46ca6cd4f4
4 changed files with 146 additions and 0 deletions

72
planet/idindex.py Normal file
View File

@ -0,0 +1,72 @@
from glob import glob
import os, sys, dbhash

# When executed as a script, make the project root importable so the
# "planet" package resolves regardless of the current working directory.
if __name__ == '__main__':
    rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, rootdir)

from planet.spider import filename
from planet import config
def open():
    """Open the id index read/write.

    Returns a dbhash handle on the index database, or None when the
    cache's 'index' directory does not exist (i.e. no index was built).
    """
    index_dir = os.path.join(config.cache_directory(), 'index')
    if not os.path.exists(index_dir):
        return None
    return dbhash.open(filename(index_dir, 'id'), 'w')
def destroy():
    """Delete the id index database and prune its (now empty) directory.

    Returns None without side effects when the index directory does not
    exist.  Fixes two defects in the original: it logged "deleted" even
    when no index file existed, and os.removedirs() raised OSError on a
    non-empty directory, aborting before the log line ran.
    """
    from planet import logger as log
    cache = config.cache_directory()
    index = os.path.join(cache, 'index')
    if not os.path.exists(index):
        return None
    idindex = filename(index, 'id')
    if os.path.exists(idindex):
        os.unlink(idindex)
        log.info(idindex + " deleted")
    try:
        # remove the index directory and any now-empty parents;
        # tolerate a non-empty directory instead of crashing
        os.removedirs(index)
    except OSError:
        pass
def create():
    """Build the id index from scratch by scanning every cached entry.

    For each non-directory file in the cache that parses as an Atom
    entry carrying both an entry id and a source feed id, stores
    index[entry-id-filename] = source feed id.  Returns the index
    reopened read/write via open().

    Fixes over the original: the bare `except:` (which swallowed even
    KeyboardInterrupt) is narrowed to Exception; the parsed document is
    now freed even when the xpath phase fails (freeDoc was skipped on
    error); the builtin name `file` is no longer shadowed.
    """
    import libxml2
    from planet import logger as log
    cache = config.cache_directory()
    index_dir = os.path.join(cache, 'index')
    if not os.path.exists(index_dir):
        os.makedirs(index_dir)
    index = dbhash.open(filename(index_dir, 'id'), 'c')

    for path in glob(cache + "/*"):
        if os.path.isdir(path):
            continue
        try:
            doc = libxml2.parseFile(path)
        except Exception:
            # unparseable cache file: log and move on
            log.error(path)
            continue
        try:
            ctxt = doc.xpathNewContext()
            ctxt.xpathRegisterNs('atom', 'http://www.w3.org/2005/Atom')
            entry = ctxt.xpathEval('/atom:entry/atom:id')
            source = ctxt.xpathEval('/atom:entry/atom:source/atom:id')
            if entry and source:
                index[filename('', entry[0].content)] = source[0].content
        except Exception:
            log.error(path)
        finally:
            # always release the libxml2 document, even on failure
            doc.freeDoc()

    log.info(str(len(index.keys())) + " entries indexed")
    index.close()
    return open()
# Command-line interface:
#   idindex.py <config> -c   (re)create the index from the cache
#   idindex.py <config> -d   delete the index
#   idindex.py <config>      report how many entries are indexed
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: %s [-c|-d]' % sys.argv[0]
        sys.exit(1)

    config.load(sys.argv[1])

    if len(sys.argv) > 2 and sys.argv[2] == '-c':
        create()
    elif len(sys.argv) > 2 and sys.argv[2] == '-d':
        destroy()
    else:
        # no flag: open the existing index (if any) and report its size
        from planet import logger as log
        index = open()
        if index:
            log.info(str(len(index.keys())) + " entries indexed")
            index.close()
        else:
            log.info("no entries indexed")

View File

@ -196,6 +196,9 @@ def spiderFeed(feed):
# perform user configured scrub operations on the data # perform user configured scrub operations on the data
scrub(feed, data) scrub(feed, data)
from planet import idindex
index = idindex.open()
# write each entry to the cache # write each entry to the cache
cache = config.cache_directory() cache = config.cache_directory()
for entry in data.entries: for entry in data.entries:
@ -234,6 +237,13 @@ def spiderFeed(feed):
write(output, cache_file) write(output, cache_file)
os.utime(cache_file, (mtime, mtime)) os.utime(cache_file, (mtime, mtime))
# optionally index
if index != None:
index[filename('', entry.id)] = \
data.feed.get('id', data.feed.get('link',None))
if index: index.close()
# identify inactive feeds # identify inactive feeds
if config.activity_threshold(feed): if config.activity_threshold(feed):
updated = [entry.updated_parsed for entry in data.entries updated = [entry.updated_parsed for entry in data.entries

View File

@ -4,6 +4,7 @@ from xml.dom import minidom
import planet, config, feedparser, reconstitute, shell import planet, config, feedparser, reconstitute, shell
from reconstitute import createTextElement, date from reconstitute import createTextElement, date
from spider import filename from spider import filename
from planet import idindex
def splice(): def splice():
""" Splice together a planet from a cache of entries """ """ Splice together a planet from a cache of entries """
@ -62,9 +63,12 @@ def splice():
reconstitute.source(xdoc.documentElement, data.feed, None, None) reconstitute.source(xdoc.documentElement, data.feed, None, None)
feed.appendChild(xdoc.documentElement) feed.appendChild(xdoc.documentElement)
index = idindex.open()
# insert entry information # insert entry information
items = 0 items = 0
for mtime,file in dir: for mtime,file in dir:
if index and index[file.split('/')[-1]] not in sub_ids: continue
try: try:
entry=minidom.parse(file) entry=minidom.parse(file)
@ -83,6 +87,8 @@ def splice():
except: except:
log.error("Error parsing %s", file) log.error("Error parsing %s", file)
if index: index.close()
return doc return doc
def apply(doc): def apply(doc):

58
tests/test_idindex.py Normal file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
import unittest
from planet import idindex, config, logger
class idIndexTest(unittest.TestCase):
    """Integration tests for planet.idindex against the spider/splice
    test fixtures.  The expected counts (12 entries, etc.) come from the
    shared test data used by test_spider/test_splice."""

    def tearDown(self):
        # remove any index a test left behind
        idindex.destroy()

    def test_index_spider(self):
        import test_spider
        config.load(test_spider.configfile)

        # a freshly created index over an empty cache has no entries
        index = idindex.create()
        self.assertEqual(0, len(index))
        index.close()

        from planet.spider import spiderPlanet
        try:
            # after spidering, every cached entry should be indexed,
            # mapping entry-id filename -> source feed id
            spiderPlanet()
            index = idindex.open()
            self.assertEqual(12, len(index))
            self.assertEqual('tag:planet.intertwingly.net,2006:testfeed1', index['planet.intertwingly.net,2006,testfeed1,1'])
            self.assertEqual('http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss', index['planet.intertwingly.net,2006,testfeed3,1'])
            index.close()
        finally:
            # clean up the working directory created by the spider run
            import os, shutil
            shutil.rmtree(test_spider.workdir)
            os.removedirs(os.path.split(test_spider.workdir)[0])

    def test_index_splice(self):
        import test_splice
        config.load(test_splice.configfile)
        index = idindex.create()

        self.assertEqual(12, len(index))
        self.assertEqual('tag:planet.intertwingly.net,2006:testfeed1', index['planet.intertwingly.net,2006,testfeed1,1'])
        self.assertEqual('http://intertwingly.net/code/venus/tests/data/spider/testfeed3.rss', index['planet.intertwingly.net,2006,testfeed3,1'])

        # corrupt the source id of testfeed2's entries (reverse the
        # string) so splice() no longer finds them among the subscribed
        # feed ids and filters those entries out
        for key,value in index.items():
            if value.find('testfeed2')>0: index[key] = value[::-1]
        index.close()

        from planet.splice import splice
        doc = splice()

        # testfeed2's entries are excluded from the spliced output
        # (12 total minus the corrupted ones leaves 8 -- presumably 4
        # entries belong to testfeed2; confirm against the fixtures)
        self.assertEqual(8,len(doc.getElementsByTagName('entry')))
        self.assertEqual(4,len(doc.getElementsByTagName('planet:source')))
        self.assertEqual(12,len(doc.getElementsByTagName('planet:name')))
# idindex.create() depends on libxml2; when the binding is missing,
# strip all test_* methods from the class so the rest of the suite can
# still run instead of failing on import errors.
try:
    import libxml2
except ImportError:
    logger.warn("libxml2 is not available => can't test id index")
    for method in dir(idIndexTest):
        if method.startswith('test_'): delattr(idIndexTest,method)