Select latest entry by id if ids are duplicated in a feed
This commit is contained in:
parent
597f0ea2fe
commit
4d603b6bf7
@@ -154,15 +154,28 @@ def writeCache(feed_uri, feed_info, data):
|
||||
from planet import idindex
|
||||
global index
|
||||
if index != None: index = idindex.open()
|
||||
|
||||
# write each entry to the cache
|
||||
cache = config.cache_directory()
|
||||
|
||||
# select latest entry for each unique id
|
||||
ids = {}
|
||||
for entry in data.entries:
|
||||
# generate an id, if none is present
|
||||
if not entry.has_key('id') or not entry.id:
|
||||
entry['id'] = reconstitute.id(None, entry)
|
||||
if not entry['id']: continue
|
||||
|
||||
# determine updated date for purposes of selection
|
||||
updated = ''
|
||||
if entry.has_key('published'): updated=entry.published
|
||||
if entry.has_key('updated'): updated=entry.updated
|
||||
|
||||
# if not seen or newer than last seen, select it
|
||||
if updated >= ids.get(entry.id,('',))[0]:
|
||||
ids[entry.id] = (updated, entry)
|
||||
|
||||
# write each entry to the cache
|
||||
cache = config.cache_directory()
|
||||
for updated, entry in ids.values():
|
||||
|
||||
# compute cache file name based on the id
|
||||
cache_file = filename(cache, entry.id)
|
||||
|
||||
|
41
tests/data/spider/testfeed4.atom
Normal file
41
tests/data/spider/testfeed4.atom
Normal file
@@ -0,0 +1,41 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
<!--
  Test fixture for duplicate-id handling: all three entries below share the
  id tag:planet.intertwingly.net,2006:testfeed4 but carry different
  atom:updated dates, deliberately listed out of chronological order
  (one, three, two).  The spider is expected to keep only the latest
  ("Earth" / "three", updated 2006-01-03).

  NOTE(review): the rel="self" link and the feed-level <id> still reference
  testfeed1 / testfeed1a.atom - presumably copied from that fixture; confirm
  the mismatch is intentional.
-->
<feed xmlns="http://www.w3.org/2005/Atom">
  <link rel="self" href="http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom"/>
  <id>tag:planet.intertwingly.net,2006:testfeed1</id>

  <title>Sam Ruby</title>
  <subtitle>It’s just data</subtitle>
  <author>
    <name>Sam Ruby</name>
    <email>rubys@intertwingly.net</email>
    <uri>http://www.intertwingly.net/blog/</uri>
  </author>
  <updated>2006-06-16T20:15:18-04:00</updated>
  <link href="http://www.intertwingly.net/blog/"/>

  <!-- oldest duplicate: should be discarded -->
  <entry>
    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
    <link href="http://example.com/1"/>
    <title>Mercury</title>
    <content>one</content>
    <updated>2006-01-01T00:00:00Z</updated>
  </entry>

  <!-- newest duplicate: should be the one cached -->
  <entry>
    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
    <link href="http://example.com/3"/>
    <title>Earth</title>
    <content>three</content>
    <updated>2006-01-03T00:00:00Z</updated>
  </entry>

  <!-- middle duplicate, listed after the newest to exercise ordering -->
  <entry>
    <id>tag:planet.intertwingly.net,2006:testfeed4</id>
    <link href="http://example.com/2"/>
    <title>Venus</title>
    <content>two</content>
    <updated>2006-01-02T00:00:00Z</updated>
  </entry>

</feed>
|
||||
|
@@ -88,6 +88,14 @@ class SpiderTest(unittest.TestCase):
|
||||
self.spiderFeed(testfeed % '1b')
|
||||
self.verify_spiderFeed()
|
||||
|
||||
def test_spiderFeedUpdatedEntries(self):
    """Spider testfeed4, whose three entries all share a single id but
    have different updated dates, and verify that only the most
    recently updated entry ends up in the cache."""
    config.load(configfile)
    self.spiderFeed(testfeed % '4')

    # exactly two files are expected in the cache directory
    cached = glob.glob(workdir + "/*")
    self.assertEqual(2, len(cached))

    # the surviving entry must be the latest one (content "three")
    parsed = feedparser.parse(workdir +
        '/planet.intertwingly.net,2006,testfeed4')
    self.assertEqual(u'three', parsed.entries[0].content[0].value)
|
||||
|
||||
def verify_spiderPlanet(self):
|
||||
files = glob.glob(workdir+"/*")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user