diff --git a/planet/spider.py b/planet/spider.py index bbabd07..5069081 100644 --- a/planet/spider.py +++ b/planet/spider.py @@ -154,15 +154,28 @@ def writeCache(feed_uri, feed_info, data): from planet import idindex global index if index != None: index = idindex.open() - - # write each entry to the cache - cache = config.cache_directory() + + # select latest entry for each unique id + ids = {} for entry in data.entries: # generate an id, if none is present if not entry.has_key('id') or not entry.id: entry['id'] = reconstitute.id(None, entry) if not entry['id']: continue + # determine updated date for purposes of selection + updated = '' + if entry.has_key('published'): updated=entry.published + if entry.has_key('updated'): updated=entry.updated + + # if not seen or newer than last seen, select it + if updated >= ids.get(entry.id,('',))[0]: + ids[entry.id] = (updated, entry) + + # write each entry to the cache + cache = config.cache_directory() + for updated, entry in ids.values(): + # compute cache file name based on the id cache_file = filename(cache, entry.id) diff --git a/tests/data/spider/testfeed4.atom b/tests/data/spider/testfeed4.atom new file mode 100644 index 0000000..0443aac --- /dev/null +++ b/tests/data/spider/testfeed4.atom @@ -0,0 +1,41 @@ + + + + tag:planet.intertwingly.net,2006:testfeed1 + + Sam Ruby + It’s just data + + Sam Ruby + rubys@intertwingly.net + http://www.intertwingly.net/blog/ + + 2006-06-16T20:15:18-04:00 + + + + tag:planet.intertwingly.net,2006:testfeed4 + + Mercury + one + 2006-01-01T00:00:00Z + + + + tag:planet.intertwingly.net,2006:testfeed4 + + Earth + three + 2006-01-03T00:00:00Z + + + + tag:planet.intertwingly.net,2006:testfeed4 + + Venus + two + 2006-01-02T00:00:00Z + + + + diff --git a/tests/test_spider.py b/tests/test_spider.py index dafbda2..9ffba8e 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -88,6 +88,14 @@ class SpiderTest(unittest.TestCase): self.spiderFeed(testfeed % '1b') self.verify_spiderFeed() + 
def test_spiderFeedUpdatedEntries(self):  # one feed carrying three entries that share a single id: only the newest may end up in the cache
+        config.load(configfile)
+        self.spiderFeed(testfeed % '4')  # presumably resolves to the testfeed4.atom fixture -- confirm against the testfeed template
+        self.assertEqual(2, len(glob.glob(workdir+"/*")))  # exactly two items in workdir; presumably the single cached entry plus one other spider artifact -- TODO confirm
+        data = feedparser.parse(workdir +
+            '/planet.intertwingly.net,2006,testfeed4')  # cache file name is derived from the entry id shared by all three entries
+        self.assertEqual(u'three', data.entries[0].content[0].value)  # 'three' is the fixture entry dated 2006-01-03, the latest of the duplicates
+
     def verify_spiderPlanet(self):
         files = glob.glob(workdir+"/*")