Select latest entry by id if ids are duplicated in a feed
This commit is contained in:
parent
597f0ea2fe
commit
4d603b6bf7
@@ -155,14 +155,27 @@ def writeCache(feed_uri, feed_info, data):
|
|||||||
global index
|
global index
|
||||||
if index != None: index = idindex.open()
|
if index != None: index = idindex.open()
|
||||||
|
|
||||||
# write each entry to the cache
|
# select latest entry for each unique id
|
||||||
cache = config.cache_directory()
|
ids = {}
|
||||||
for entry in data.entries:
|
for entry in data.entries:
|
||||||
# generate an id, if none is present
|
# generate an id, if none is present
|
||||||
if not entry.has_key('id') or not entry.id:
|
if not entry.has_key('id') or not entry.id:
|
||||||
entry['id'] = reconstitute.id(None, entry)
|
entry['id'] = reconstitute.id(None, entry)
|
||||||
if not entry['id']: continue
|
if not entry['id']: continue
|
||||||
|
|
||||||
|
# determine updated date for purposes of selection
|
||||||
|
updated = ''
|
||||||
|
if entry.has_key('published'): updated=entry.published
|
||||||
|
if entry.has_key('updated'): updated=entry.updated
|
||||||
|
|
||||||
|
# if not seen or newer than last seen, select it
|
||||||
|
if updated >= ids.get(entry.id,('',))[0]:
|
||||||
|
ids[entry.id] = (updated, entry)
|
||||||
|
|
||||||
|
# write each entry to the cache
|
||||||
|
cache = config.cache_directory()
|
||||||
|
for updated, entry in ids.values():
|
||||||
|
|
||||||
# compute cache file name based on the id
|
# compute cache file name based on the id
|
||||||
cache_file = filename(cache, entry.id)
|
cache_file = filename(cache, entry.id)
|
||||||
|
|
||||||
|
41
tests/data/spider/testfeed4.atom
Normal file
41
tests/data/spider/testfeed4.atom
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||||
|
<link rel="self" href="http://intertwingly.net/code/venus/tests/data/spider/testfeed1a.atom"/>
|
||||||
|
<id>tag:planet.intertwingly.net,2006:testfeed1</id>
|
||||||
|
|
||||||
|
<title>Sam Ruby</title>
|
||||||
|
<subtitle>It’s just data</subtitle>
|
||||||
|
<author>
|
||||||
|
<name>Sam Ruby</name>
|
||||||
|
<email>rubys@intertwingly.net</email>
|
||||||
|
<uri>http://www.intertwingly.net/blog/</uri>
|
||||||
|
</author>
|
||||||
|
<updated>2006-06-16T20:15:18-04:00</updated>
|
||||||
|
<link href="http://www.intertwingly.net/blog/"/>
|
||||||
|
|
||||||
|
<entry>
|
||||||
|
<id>tag:planet.intertwingly.net,2006:testfeed4</id>
|
||||||
|
<link href="http://example.com/1"/>
|
||||||
|
<title>Mercury</title>
|
||||||
|
<content>one</content>
|
||||||
|
<updated>2006-01-01T00:00:00Z</updated>
|
||||||
|
</entry>
|
||||||
|
|
||||||
|
<entry>
|
||||||
|
<id>tag:planet.intertwingly.net,2006:testfeed4</id>
|
||||||
|
<link href="http://example.com/3"/>
|
||||||
|
<title>Earth</title>
|
||||||
|
<content>three</content>
|
||||||
|
<updated>2006-01-03T00:00:00Z</updated>
|
||||||
|
</entry>
|
||||||
|
|
||||||
|
<entry>
|
||||||
|
<id>tag:planet.intertwingly.net,2006:testfeed4</id>
|
||||||
|
<link href="http://example.com/2"/>
|
||||||
|
<title>Venus</title>
|
||||||
|
<content>two</content>
|
||||||
|
<updated>2006-01-02T00:00:00Z</updated>
|
||||||
|
</entry>
|
||||||
|
|
||||||
|
</feed>
|
||||||
|
|
@@ -88,6 +88,14 @@ class SpiderTest(unittest.TestCase):
|
|||||||
self.spiderFeed(testfeed % '1b')
|
self.spiderFeed(testfeed % '1b')
|
||||||
self.verify_spiderFeed()
|
self.verify_spiderFeed()
|
||||||
|
|
||||||
|
def test_spiderFeedUpdatedEntries(self):
|
||||||
|
config.load(configfile)
|
||||||
|
self.spiderFeed(testfeed % '4')
|
||||||
|
self.assertEqual(2, len(glob.glob(workdir+"/*")))
|
||||||
|
data = feedparser.parse(workdir +
|
||||||
|
'/planet.intertwingly.net,2006,testfeed4')
|
||||||
|
self.assertEqual(u'three', data.entries[0].content[0].value)
|
||||||
|
|
||||||
def verify_spiderPlanet(self):
|
def verify_spiderPlanet(self):
|
||||||
files = glob.glob(workdir+"/*")
|
files = glob.glob(workdir+"/*")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user