Select latest entry by id if ids are duplicated in a feed

Sam Ruby 2007-08-19 12:46:51 -04:00
parent 597f0ea2fe
commit 4d603b6bf7
3 changed files with 65 additions and 3 deletions


@@ -154,15 +154,28 @@ def writeCache(feed_uri, feed_info, data):
     from planet import idindex
     global index
     if index != None: index = idindex.open()
-    # write each entry to the cache
-    cache = config.cache_directory()
+    # select latest entry for each unique id
+    ids = {}
     for entry in data.entries:
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
             if not entry['id']: continue
+        # determine updated date for purposes of selection
+        updated = ''
+        if entry.has_key('published'): updated=entry.published
+        if entry.has_key('updated'): updated=entry.updated
+        # if not seen or newer than last seen, select it
+        if updated >= ids.get(entry.id,('',))[0]:
+            ids[entry.id] = (updated, entry)
+    # write each entry to the cache
+    cache = config.cache_directory()
+    for updated, entry in ids.values():
         # compute cache file name based on the id
         cache_file = filename(cache, entry.id)
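Taken on its own, the selection step added above amounts to: key entries by id, use updated (falling back to published) as a string-comparable timestamp, and keep the newest entry per key. A minimal standalone sketch of that technique, with plain dicts standing in for feedparser entries (select_latest is an illustrative name, not part of the commit):

def select_latest(entries):
    """Keep only the newest entry for each id (illustrative sketch of the
    selection step added to writeCache above; not the project's own code)."""
    ids = {}
    for entry in entries:
        # the real code reconstitutes a missing id first; here we just skip
        if not entry.get('id'):
            continue
        # prefer 'updated', fall back to 'published'; ISO 8601 timestamps in a
        # single timezone compare correctly as plain strings
        updated = entry.get('updated') or entry.get('published') or ''
        if updated >= ids.get(entry['id'], ('',))[0]:
            ids[entry['id']] = (updated, entry)
    return [entry for updated, entry in ids.values()]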


@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <link rel="self" href="http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom"/>
+  <id>tag:planet.intertwingly.net,2006:testfeed1</id>
+  <title>Sam Ruby</title>
+  <subtitle>It’s just data</subtitle>
+  <author>
+    <name>Sam Ruby</name>
+    <email>rubys@intertwingly.net</email>
+    <uri>http://www.intertwingly.net/blog/</uri>
+  </author>
+  <updated>2006-06-16T20:15:18-04:00</updated>
+  <link href="http://www.intertwingly.net/blog/"/>
+  <entry>
+    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
+    <link href="http://example.com/1"/>
+    <title>Mercury</title>
+    <content>one</content>
+    <updated>2006-01-01T00:00:00Z</updated>
+  </entry>
+  <entry>
+    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
+    <link href="http://example.com/3"/>
+    <title>Earth</title>
+    <content>three</content>
+    <updated>2006-01-03T00:00:00Z</updated>
+  </entry>
+  <entry>
+    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
+    <link href="http://example.com/2"/>
+    <title>Venus</title>
+    <content>two</content>
+    <updated>2006-01-02T00:00:00Z</updated>
+  </entry>
+</feed>
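The three entries in this fixture deliberately share one id while their updated dates are out of document order (Mercury 2006-01-01, Earth 2006-01-03, Venus 2006-01-02), so the newest entry is the middle one. Run through the select_latest sketch shown earlier (again with plain dicts standing in for parsed entries), only Earth should survive:

entries = [
    {'id': 'tag:planet.intertwingly.net,2006:testfeed4',
     'title': 'Mercury', 'content': 'one', 'updated': '2006-01-01T00:00:00Z'},
    {'id': 'tag:planet.intertwingly.net,2006:testfeed4',
     'title': 'Earth', 'content': 'three', 'updated': '2006-01-03T00:00:00Z'},
    {'id': 'tag:planet.intertwingly.net,2006:testfeed4',
     'title': 'Venus', 'content': 'two', 'updated': '2006-01-02T00:00:00Z'},
]
survivors = select_latest(entries)
print([e['title'] for e in survivors])   # ['Earth']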


@@ -88,6 +88,14 @@ class SpiderTest(unittest.TestCase):
         self.spiderFeed(testfeed % '1b')
         self.verify_spiderFeed()
 
+    def test_spiderFeedUpdatedEntries(self):
+        config.load(configfile)
+        self.spiderFeed(testfeed % '4')
+        self.assertEqual(2, len(glob.glob(workdir+"/*")))
+        data = feedparser.parse(workdir +
+            '/planet.intertwingly.net,2006,testfeed4')
+        self.assertEqual(u'three', data.entries[0].content[0].value)
+
     def verify_spiderPlanet(self):
         files = glob.glob(workdir+"/*")
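The new test checks that only two items land in the cache working directory and then parses the surviving entry's cache file directly. The path it hard-codes hints at how an Atom id is turned into a cache file name: tag:planet.intertwingly.net,2006:testfeed4 becomes planet.intertwingly.net,2006,testfeed4. A rough, hypothetical approximation of that mapping, inferred only from this path (the real filename() helper in spider.py handles many more cases and characters):

import os
import re

def cache_filename(cache_dir, entry_id):
    # Hypothetical sketch only: drop the URI scheme and substitute characters
    # that are unsafe in file names; the project's filename() is more thorough.
    name = re.sub(r'^\w+:/*', '', entry_id)   # strip 'tag:' or 'http://' prefix
    name = re.sub(r'[^\w.,-]', ',', name)     # e.g. ':' becomes ','
    return os.path.join(cache_dir, name)

print(cache_filename('cache', 'tag:planet.intertwingly.net,2006:testfeed4'))
# cache/planet.intertwingly.net,2006,testfeed4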